
adding eval harness pipeline

Hamid Shojanazeri · 1 year ago
commit 2c2fcd14e4

+ 97 - 0
eval/README.md

File diff suppressed because it is too large


+ 13 - 0
eval/boolq_new.yaml

@@ -0,0 +1,13 @@
+task: demo_boolq_5
+dataset_path: super_glue
+dataset_name: boolq
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
+doc_to_target: label
+doc_to_choice: ["no", "yes"]
+should_decontaminate: true
+doc_to_decontamination_query: passage
+metric_list:
+  - metric: acc
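
A hedged sketch (not part of the commit) of how a custom task file like this is typically exercised: register the directory containing the YAML with --include_path and select the task by its `task:` name. The model argument is just the placeholder used in the argument help further below; any lm-evaluation-harness version whose CLI supports --include_path should behave the same way.

    lm-eval --model hf \
        --model_args pretrained=EleutherAI/pythia-160m \
        --tasks demo_boolq_5 \
        --include_path ./eval \
        --batch_size 8 \
        --limit 20    # small smoke test; drop --limit for real metrics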

+ 137 - 0
eval/eval.py

@@ -0,0 +1,137 @@
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+import numpy as np
+import lm_eval
+from lm_eval import evaluator, tasks
+from lm_eval.utils import make_table, load_yaml_config
+
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+def setup_logging(verbosity):
+    logging.basicConfig(level=verbosity.upper(), format='%(asctime)s - %(levelname)s - %(message)s')
+    return logging.getLogger(__name__)
+
+
+def handle_output(args, results, logger):
+    if not args.output_path:
+        if args.log_samples:
+            logger.error("Specify --output_path for logging samples.")
+            sys.exit(1)
+        logger.info(json.dumps(results, indent=2, default=_handle_non_serializable))
+        return
+
+    path = Path(args.output_path)
+    if path.is_file() or path.joinpath("results.json").is_file():
+        logger.warning(f"File already exists at {path}. Results will be overwritten.")
+
+    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Write to the given file, or to results.json inside the given directory.
+    results_file = path if path.suffix in (".json", ".jsonl") else output_dir.joinpath("results.json")
+
+    results_str = json.dumps(results, indent=2, default=_handle_non_serializable)
+    if args.show_config:
+        logger.info(results_str)
+
+    with results_file.open("w", encoding="utf-8") as f:
+        f.write(results_str)
+
+    if args.log_samples:
+        samples = results.pop("samples", {})
+        for task_name, _ in results.get("configs", {}).items():
+            output_name = re.sub(r"/|=", "__", args.model_args) + "_" + task_name
+            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
+            sample_data = json.dumps(samples.get(task_name, {}), indent=2, default=_handle_non_serializable)
+            sample_file.write_text(sample_data, encoding="utf-8")
+
+    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
+    summary = f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
+    logger.info(summary)
+    logger.info(make_table(results))
+    if "groups" in results:
+        logger.info(make_table(results, "groups"))
+
+    
+def load_tasks(args):
+    tasks.initialize_tasks()
+    if args.include_path:
+        # Register any external task configs passed via --include_path.
+        lm_eval.tasks.include_path(args.include_path)
+    if args.open_llm_leaderboard_tasks:
+        current_dir = os.getcwd()
+        config_dir = os.path.join(current_dir, "open_llm_leaderboard")
+        lm_eval.tasks.include_path(config_dir)
+        return [
+            "arc_challenge_25_shot",
+            "hellaswag_10_shot",
+            "truthfulqa_mc2",
+            "winogrande_5_shot",
+            "gsm8k"
+        ]
+    return args.tasks.split(",") if args.tasks else []
+        
+def parse_eval_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("--model", "-m", default="hf", help="Name of model, e.g., `hf`.")
+    parser.add_argument("--tasks", "-t", default=None, help="Comma-separated list of tasks, or 'list' to display available tasks.")
+    parser.add_argument("--model_args", "-a", default="", help="Comma-separated string arguments for model, e.g., `pretrained=EleutherAI/pythia-160m`.")
+    parser.add_argument("--open-llm-leaderboard-tasks", "-oplm", action="store_true", default=False, help="Choose the list of tasks with specification in HF open LLM-leaderboard.")
+    parser.add_argument("--num_fewshot", "-f", type=int, default=None, help="Number of examples in few-shot context.")
+    parser.add_argument("--batch_size", "-b", default=1, help="Batch size, can be 'auto', 'auto:N', or an integer.")
+    parser.add_argument("--max_batch_size", type=int, default=None, help="Maximal batch size with 'auto' batch size.")
+    parser.add_argument("--device", default=None, help="Device for evaluation, e.g., 'cuda', 'cpu'.")
+    parser.add_argument("--output_path", "-o", type=str, default=None, help="Path for saving results.")
+    parser.add_argument("--limit", "-L", type=float, default=None, help="Limit number of examples per task.")
+    parser.add_argument("--use_cache", "-c", default=None, help="Path to cache db file, if used.")
+    parser.add_argument("--verbosity", "-v", default="INFO", help="Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.")
+    parser.add_argument("--gen_kwargs", default=None, help="Generation kwargs for tasks that support it.")
+    parser.add_argument("--check_integrity", action="store_true", help="Whether to run the relevant part of the test suite for the tasks.")
+    parser.add_argument("--write_out", "-w", action="store_true", default=False, help="Prints the prompt for the first few documents.")
+    parser.add_argument("--log_samples", "-s", action="store_true", default=False, help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.")
+    parser.add_argument("--show_config", action="store_true", default=False, help="If True, shows the full config of all tasks at the end of the evaluation.")
+    parser.add_argument("--include_path", type=str, default=None, help="Additional path to include if there are external tasks.")
+    parser.add_argument("--decontamination_ngrams_path", default=None)  # Not currently used
+    return parser.parse_args()
+
+
+def evaluate_model(args):
+    try:
+        task_list = load_tasks(args)
+        # Customized models (e.g., quantized models) can be added by following:
+        # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+
+        # Evaluate
+        results = evaluator.simple_evaluate(
+            model=args.model,
+            model_args=args.model_args,
+            tasks=task_list,
+            num_fewshot=args.num_fewshot,
+            batch_size=args.batch_size,
+            max_batch_size=args.max_batch_size,
+            device=args.device,
+            use_cache=args.use_cache,
+            limit=args.limit,
+            decontamination_ngrams_path=args.decontamination_ngrams_path,
+            check_integrity=args.check_integrity,
+            write_out=args.write_out,
+            log_samples=args.log_samples,
+            gen_kwargs=args.gen_kwargs,
+        )
+        handle_output(args, results, logger)
+            
+    except Exception as e:
+        logger.error(f"An error occurred during evaluation: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    args = parse_eval_args()
+    logger = setup_logging(args.verbosity)
+    evaluate_model(args)
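
A usage sketch (not from the commit) for the script above, assuming the leaderboard YAMLs under open_llm_leaderboard/ have already been prepared with open_llm_eval_prep.sh and that the command is run from the eval/ directory, since load_tasks resolves open_llm_leaderboard/ relative to the current working directory. The model name is only the placeholder from the argument help.

    cd eval
    python eval.py \
        --model hf \
        --model_args pretrained=EleutherAI/pythia-160m \
        --open-llm-leaderboard-tasks \
        --batch_size auto \
        --output_path eval_results/results.json \
        --log_samples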

+ 287 - 0
eval/eval_main.py

@@ -0,0 +1,287 @@
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+
+from lm_eval import evaluator, utils
+from lm_eval.api.registry import ALL_TASKS
+from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.utils import make_table
+
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+
+def parse_eval_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--tasks",
+        "-t",
+        default=None,
+        metavar="task1,task2",
+        help="To get full list of tasks, use the command lm-eval --tasks list",
+    )
+    parser.add_argument(
+        "--model_args",
+        "-a",
+        default="",
+        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
+    )
+    parser.add_argument(
+        "--num_fewshot",
+        "-f",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
+    )
+    parser.add_argument(
+        "--batch_size",
+        "-b",
+        type=str,
+        default=1,
+        metavar="auto|auto:N|N",
+        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximal batch size to try with --batch_size auto.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=None,
+        help="Device to use (e.g. cuda, cuda:0, cpu).",
+    )
+    parser.add_argument(
+        "--output_path",
+        "-o",
+        default=None,
+        type=str,
+        metavar="DIR|DIR/file.json",
+        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
+    )
+    parser.add_argument(
+        "--limit",
+        "-L",
+        type=float,
+        default=None,
+        metavar="N|0<N<1",
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
+    parser.add_argument(
+        "--use_cache",
+        "-c",
+        type=str,
+        default=None,
+        metavar="DIR",
+        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
+    )
+    parser.add_argument("--decontamination_ngrams_path", default=None)  # TODO: not used
+    parser.add_argument(
+        "--check_integrity",
+        action="store_true",
+        help="Whether to run the relevant part of the test suite for the tasks.",
+    )
+    parser.add_argument(
+        "--write_out",
+        "-w",
+        action="store_true",
+        default=False,
+        help="Prints the prompt for the first few documents.",
+    )
+    parser.add_argument(
+        "--log_samples",
+        "-s",
+        action="store_true",
+        default=False,
+        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
+    )
+    parser.add_argument(
+        "--show_config",
+        action="store_true",
+        default=False,
+        help="If True, shows the the full config of all tasks at the end of the evaluation.",
+    )
+    parser.add_argument(
+        "--include_path",
+        type=str,
+        default=None,
+        metavar="DIR",
+        help="Additional path to include if there are external tasks to include.",
+    )
+    parser.add_argument(
+        "--gen_kwargs",
+        default=None,
+        help=(
+            "String arguments for model generation on greedy_until tasks,"
+            " e.g. `temperature=0,top_k=0,top_p=0`."
+        ),
+    )
+    parser.add_argument(
+        "--verbosity",
+        "-v",
+        type=str.upper,
+        default="INFO",
+        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
+        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
+    )
+    return parser.parse_args()
+
+
+def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
+    if not args:
+        # we allow for args to be passed externally, else we parse them ourselves
+        args = parse_eval_args()
+
+    eval_logger = utils.eval_logger
+    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    eval_logger.info(f"Verbosity set to {args.verbosity}")
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+    initialize_tasks(args.verbosity)
+
+    if args.limit:
+        eval_logger.warning(
+            " --limit SHOULD ONLY BE USED FOR TESTING."
+            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
+        include_path(args.include_path)
+
+    if args.tasks is None:
+        task_names = ALL_TASKS
+    elif args.tasks == "list":
+        eval_logger.info(
+            "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
+        )
+        sys.exit()
+    else:
+        if os.path.isdir(args.tasks):
+            import glob
+
+            task_names = []
+            yaml_path = os.path.join(args.tasks, "*.yaml")
+            for yaml_file in glob.glob(yaml_path):
+                config = utils.load_yaml_config(yaml_file)
+                task_names.append(config)
+        else:
+            tasks_list = args.tasks.split(",")
+            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
+            for task in [task for task in tasks_list if task not in task_names]:
+                if os.path.isfile(task):
+                    config = utils.load_yaml_config(task)
+                    task_names.append(config)
+            task_missing = [
+                task
+                for task in tasks_list
+                if task not in task_names and "*" not in task
+            ]  # we don't want errors if a wildcard ("*") task name was used
+
+            if task_missing:
+                missing = ", ".join(task_missing)
+                eval_logger.error(
+                    f"Tasks were not found: {missing}\n"
+                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
+                )
+                raise ValueError(
+                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
+                )
+
+    if args.output_path:
+        path = Path(args.output_path)
+        # check if file or 'dir/results.json' exists
+        if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
+            eval_logger.warning(
+                f"File already exists at {path}. Results will be overwritten."
+            )
+            output_path_file = path.joinpath("results.json")
+            assert not path.is_file(), "File already exists"
+        # if path json then get parent dir
+        elif path.suffix in (".json", ".jsonl"):
+            output_path_file = path
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path = path.parent
+        else:
+            path.mkdir(parents=True, exist_ok=True)
+            output_path_file = path.joinpath("results.json")
+    elif args.log_samples and not args.output_path:
+        assert args.output_path, "Specify --output_path"
+
+    eval_logger.info(f"Selected Tasks: {task_names}")
+    print(f"type of model args: {type(args.model_args)}")
+    print("*************************************")
+    results = evaluator.simple_evaluate(
+        model=args.model,
+        model_args=args.model_args,
+        tasks=task_names,
+        num_fewshot=args.num_fewshot,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        device=args.device,
+        use_cache=args.use_cache,
+        limit=args.limit,
+        decontamination_ngrams_path=args.decontamination_ngrams_path,
+        check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        log_samples=args.log_samples,
+        gen_kwargs=args.gen_kwargs,
+    )
+
+    if results is not None:
+        if args.log_samples:
+            samples = results.pop("samples")
+        dumped = json.dumps(
+            results, indent=2, default=_handle_non_serializable, ensure_ascii=False
+        )
+        if args.show_config:
+            print(dumped)
+
+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
+
+        if args.output_path:
+            output_path_file.open("w").write(dumped)
+
+            if args.log_samples:
+                for task_name, config in results["configs"].items():
+                    output_name = "{}_{}".format(
+                        re.sub("/|=", "__", args.model_args), task_name
+                    )
+                    filename = path.joinpath(f"{output_name}.jsonl")
+                    samples_dumped = json.dumps(
+                        samples[task_name],
+                        indent=2,
+                        default=_handle_non_serializable,
+                        ensure_ascii=False,
+                    )
+                    filename.write_text(samples_dumped, encoding="utf-8")
+
+        print(
+            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
+        )
+        print(make_table(results))
+        if "groups" in results:
+            print(make_table(results, "groups"))
+
+
+if __name__ == "__main__":
+    cli_evaluate()
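
This file closely mirrors the harness's own CLI entry point. A hedged example (not from the commit) of its directory-of-YAMLs branch: when --tasks points at a directory, every *.yaml in it is loaded as a task config. This assumes open_llm_eval_prep.sh has already substituted {$EVAL_PATH} in those files.

    python eval/eval_main.py \
        --model hf \
        --model_args pretrained=EleutherAI/pythia-160m \
        --tasks eval/open_llm_leaderboard \
        --batch_size auto \
        --output_path eval_results/results.json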

+ 83 - 0
eval/evaluate_lm.py

@@ -0,0 +1,83 @@
+import fire
+from lm_eval.base import LM
+from lm_eval import tasks, evaluator
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+class HuggingFaceModel(LM):
+    def __init__(self, model_name, tokenizer_name, device=None):
+        self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self._device)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        # Use the model's context length when exposed, otherwise a common default.
+        self._max_seq_length = getattr(self.model.config, "max_position_embeddings", 2048)
+
+    def loglikelihood(self, ctx, cont):
+        # Encode context and continuation separately so the context tokens can be
+        # masked out of the loss.
+        ctx_ids = self.tokenizer.encode(ctx, add_special_tokens=False)
+        cont_ids = self.tokenizer.encode(cont, add_special_tokens=False)
+
+        input_ids = torch.tensor([ctx_ids + cont_ids], device=self._device)
+        # Ignore context positions (-100) so the loss covers only the continuation.
+        labels = input_ids.clone()
+        labels[:, : len(ctx_ids)] = -100
+
+        # loss is the mean NLL over the continuation tokens; multiply back to get the sum.
+        with torch.no_grad():
+            outputs = self.model(input_ids=input_ids, labels=labels)
+            log_likelihood = -outputs.loss.item() * len(cont_ids)
+
+        return log_likelihood, len(cont_ids)
+    
+    @property
+    def eot_token_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        return self._max_seq_length
+
+    @property
+    def max_gen_toks(self):
+        return 50
+
+    @property
+    def batch_size(self):
+        return 1
+
+    @property
+    def device(self):
+        return self._device
+
+    def tok_encode(self, string: str):
+        # The eval harness expects a plain list of token ids; rely on the tokenizer's
+        # default special-token handling (BOS but no EOS for typical causal-LM tokenizers).
+        return self.tokenizer.encode(string)
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
+
+    def _model_call(self, inputs):
+        # Forward pass only; the harness consumes the returned logits.
+        with torch.no_grad():
+            return self.model(inputs).logits
+    
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Generation is not needed for the loglikelihood-style tasks targeted here.
+        raise NotImplementedError("generation is not implemented for this wrapper")
+    # Implement other required methods if needed
+
+def evaluate_model(model_name, tokenizer_name, task_list):
+    # Instantiate the model
+    model = HuggingFaceModel(model_name, tokenizer_name)
+
+    # Convert task_list string to list
+    task_list = task_list.split(',')
+
+    # Evaluate
+    results = evaluator.evaluate(lm=model, tasks=tasks.get_task_dict(task_list), provide_description=True)
+    print(results)
+
+if __name__ == "__main__":
+    fire.Fire(evaluate_model)
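
An invocation sketch (not from the commit): fire.Fire exposes evaluate_model's parameters as command-line flags. The model, tokenizer, and task names here are only examples, and note that this file targets the older lm_eval.base API rather than the one used by eval.py and eval_main.py.

    python eval/evaluate_lm.py \
        --model_name EleutherAI/pythia-160m \
        --tokenizer_name EleutherAI/pythia-160m \
        --task_list hellaswag,winogrande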

+ 22 - 0
eval/open_llm_eval_prep.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Prompt the user for the EVAL_PATH
+read -p "Enter the asbolute path to the lm-evaluation-harness: " EVAL_PATH
+# Directory containing YAML files
+DIR="open_llm_leaderboard"
+
+# Check if the directory exists
+if [ ! -d "$DIR" ]; then
+    echo "Error: Directory '$DIR' not found."
+    exit 1
+fi
+
+# Iterate over YAML files in the directory and update them
+for YAML_FILE in "$DIR"/*.yaml
+do
+    if [ -f "$YAML_FILE" ]; then
+        sed -i 's|{\$EVAL_PATH}|'"$EVAL_PATH"'|g' "$YAML_FILE"
+        echo "Updated $YAML_FILE with EVAL_PATH: $EVAL_PATH"
+    fi
+done
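
Example run (a sketch, with a hypothetical path): the script rewrites every {$EVAL_PATH} placeholder in the open_llm_leaderboard YAMLs to point at a local clone of lm-evaluation-harness, after which eval.py can include them.

    cd eval
    bash open_llm_eval_prep.sh
    # When prompted, enter the absolute path to the clone, e.g.:
    #   /home/user/lm-evaluation-harness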

+ 6 - 0
eval/open_llm_leaderboard/arc_challeneg_25shots.yaml

@@ -0,0 +1,6 @@
+include: {$EVAL_PATH}/lm_eval/tasks/arc/arc_challenge.yaml
+task: arc_challenge_25_shot
+task_alias: arc 25 shot
+num_fewshot: 25
+metric_list:
+  - metric: acc_norm

+ 6 - 0
eval/open_llm_leaderboard/hellaswag_10shots.yaml

@@ -0,0 +1,6 @@
+include: {$EVAL_PATH}/lm_eval/tasks/hellaswag/hellaswag.yaml
+task: hellaswag_10_shot
+task_alias: hellaswag 10 shot
+num_fewshot: 10
+metric_list:
+  - metric: acc_norm

+ 24 - 0
eval/open_llm_leaderboard/hellaswag_utils.py

@@ -0,0 +1,24 @@
+import datasets
+import re
+
+
+def preprocess(text):
+    text = text.strip()
+    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
+    text = text.replace(" [title]", ". ")
+    text = re.sub("\\[.*?\\]", "", text)
+    text = text.replace("  ", " ")
+    return text
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc):
+        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
+        out_doc = {
+            "query": preprocess(doc["activity_label"] + ": " + ctx),
+            "choices": [preprocess(ending) for ending in doc["endings"]],
+            "gold": int(doc["label"]),
+        }
+        return out_doc
+
+    return dataset.map(_process_doc)

+ 6 - 0
eval/open_llm_leaderboard/winogrande_5shots.yaml

@@ -0,0 +1,6 @@
+include: {$EVAL_PATH}/lm_eval/tasks/winogrande/default.yaml
+task: winogrande_5_shot
+task_alias: winogrande 5 shot
+num_fewshot: 5
+metric_list:
+  - metric: acc