
adding eval harness pipeline

Hamid Shojanazeri · 1 year ago
commit 2c2fcd14e4

+ 97 - 0
eval/README.md

File diff suppressed because it is too large


+ 13 - 0
eval/boolq_new.yaml

@@ -0,0 +1,13 @@
+task: demo_boolq_5
+dataset_path: super_glue
+dataset_name: boolq
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
+doc_to_target: label
+doc_to_choice: ["no", "yes"]
+should_decontaminate: true
+doc_to_decontamination_query: passage
+metric_list:
+  - metric: acc
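
A hedged sketch (not part of the commit) of how a custom task file like this is typically exercised: register the directory containing the YAML with --include_path and select the task by its `task:` name. The model argument is just the placeholder used in the argument help further below; any lm-evaluation-harness version whose CLI supports --include_path should behave the same way.

    lm-eval --model hf \
        --model_args pretrained=EleutherAI/pythia-160m \
        --tasks demo_boolq_5 \
        --include_path ./eval \
        --batch_size 8 \
        --limit 20    # small smoke test; drop --limit for real metrics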

+ 137 - 0
eval/eval.py

@@ -0,0 +1,137 @@
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+import numpy as np
+import lm_eval
+from lm_eval import evaluator, tasks
+from lm_eval.utils import make_table, load_yaml_config
+
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+def setup_logging(verbosity):
+    logging.basicConfig(level=verbosity.upper(), format='%(asctime)s - %(levelname)s - %(message)s')
+    return logging.getLogger(__name__)
+
+
+def handle_output(args, results, logger):
+    if not args.output_path:
+        if args.log_samples:
+            logger.error("Specify --output_path for logging samples.")
+            sys.exit(1)
+        logger.info(json.dumps(results, indent=2, default=_handle_non_serializable))
+        return
+
+    path = Path(args.output_path)
+    if path.is_file() or path.joinpath("results.json").is_file():
+        logger.warning(f"File already exists at {path}. Results will be overwritten.")
+
+    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Write to the given file, or to results.json inside the given directory.
+    results_file = path if path.suffix in (".json", ".jsonl") else output_dir.joinpath("results.json")
+
+    results_str = json.dumps(results, indent=2, default=_handle_non_serializable)
+    if args.show_config:
+        logger.info(results_str)
+
+    with results_file.open("w", encoding="utf-8") as f:
+        f.write(results_str)
+
+    if args.log_samples:
+        samples = results.pop("samples", {})
+        for task_name, _ in results.get("configs", {}).items():
+            output_name = re.sub(r"/|=", "__", args.model_args) + "_" + task_name
+            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
+            sample_data = json.dumps(samples.get(task_name, {}), indent=2, default=_handle_non_serializable)
+            sample_file.write_text(sample_data, encoding="utf-8")
+
+    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
+    summary = f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
+    logger.info(summary)
+    logger.info(make_table(results))
+    if "groups" in results:
+        logger.info(make_table(results, "groups"))
+
+    
+def load_tasks(args):
+    tasks.initialize_tasks()
+    if args.include_path:
+        # Register any external task configs passed via --include_path.
+        lm_eval.tasks.include_path(args.include_path)
+    if args.open_llm_leaderboard_tasks:
+        current_dir = os.getcwd()
+        config_dir = os.path.join(current_dir, "open_llm_leaderboard")
+        lm_eval.tasks.include_path(config_dir)
+        return [
+            "arc_challenge_25_shot",
+            "hellaswag_10_shot",
+            "truthfulqa_mc2",
+            "winogrande_5_shot",
+            "gsm8k"
+        ]
+    return args.tasks.split(",") if args.tasks else []
+        
+def parse_eval_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("--model", "-m", default="hf", help="Name of model, e.g., `hf`.")
+    parser.add_argument("--tasks", "-t", default=None, help="Comma-separated list of tasks, or 'list' to display available tasks.")
+    parser.add_argument("--model_args", "-a", default="", help="Comma-separated string arguments for model, e.g., `pretrained=EleutherAI/pythia-160m`.")
+    parser.add_argument("--open-llm-leaderboard-tasks", "-oplm", action="store_true", default=False, help="Choose the list of tasks with specification in HF open LLM-leaderboard.")
+    parser.add_argument("--num_fewshot", "-f", type=int, default=None, help="Number of examples in few-shot context.")
+    parser.add_argument("--batch_size", "-b", default=1, help="Batch size, can be 'auto', 'auto:N', or an integer.")
+    parser.add_argument("--max_batch_size", type=int, default=None, help="Maximal batch size with 'auto' batch size.")
+    parser.add_argument("--device", default=None, help="Device for evaluation, e.g., 'cuda', 'cpu'.")
+    parser.add_argument("--output_path", "-o", type=str, default=None, help="Path for saving results.")
+    parser.add_argument("--limit", "-L", type=float, default=None, help="Limit number of examples per task.")
+    parser.add_argument("--use_cache", "-c", default=None, help="Path to cache db file, if used.")
+    parser.add_argument("--verbosity", "-v", default="INFO", help="Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.")
+    parser.add_argument("--gen_kwargs", default=None, help="Generation kwargs for tasks that support it.")
+    parser.add_argument("--check_integrity", action="store_true", help="Whether to run the relevant part of the test suite for the tasks.")
+    parser.add_argument("--write_out", "-w", action="store_true", default=False, help="Prints the prompt for the first few documents.")
+    parser.add_argument("--log_samples", "-s", action="store_true", default=False, help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.")
+    parser.add_argument("--show_config", action="store_true", default=False, help="If True, shows the full config of all tasks at the end of the evaluation.")
+    parser.add_argument("--include_path", type=str, default=None, help="Additional path to include if there are external tasks.")
+    parser.add_argument("--decontamination_ngrams_path", default=None)  # Not currently used
+    return parser.parse_args()
+
+
+def evaluate_model(args):
+    try:
+        task_list = load_tasks(args)
+        # Customized models (e.g., quantized models) can be added by following:
+        # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+
+        # Evaluate
+        results = evaluator.simple_evaluate(
+            model=args.model,
+            model_args=args.model_args,
+            tasks=task_list,
+            num_fewshot=args.num_fewshot,
+            batch_size=args.batch_size,
+            max_batch_size=args.max_batch_size,
+            device=args.device,
+            use_cache=args.use_cache,
+            limit=args.limit,
+            decontamination_ngrams_path=args.decontamination_ngrams_path,
+            check_integrity=args.check_integrity,
+            write_out=args.write_out,
+            log_samples=args.log_samples,
+            gen_kwargs=args.gen_kwargs,
+        )
+        handle_output(args, results, logger)
+            
+    except Exception as e:
+        logger.error(f"An error occurred during evaluation: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    args = parse_eval_args()
+    logger = setup_logging(args.verbosity)
+    evaluate_model(args)
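
A usage sketch (not from the commit) for the script above, assuming the leaderboard YAMLs under open_llm_leaderboard/ have already been prepared with open_llm_eval_prep.sh and that the command is run from the eval/ directory, since load_tasks resolves open_llm_leaderboard/ relative to the current working directory. The model name is only the placeholder from the argument help.

    cd eval
    python eval.py \
        --model hf \
        --model_args pretrained=EleutherAI/pythia-160m \
        --open-llm-leaderboard-tasks \
        --batch_size auto \
        --output_path eval_results/results.json \
        --log_samples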

+ 287 - 0
eval/eval_main.py

@@ -0,0 +1,287 @@
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+
+from lm_eval import evaluator, utils
+from lm_eval.api.registry import ALL_TASKS
+from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.utils import make_table
+
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+
+def parse_eval_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--tasks",
+        "-t",
+        default=None,
+        metavar="task1,task2",
+        help="To get full list of tasks, use the command lm-eval --tasks list",
+    )
+    parser.add_argument(
+        "--model_args",
+        "-a",
+        default="",
+        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
+    )
+    parser.add_argument(
+        "--num_fewshot",
+        "-f",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
+    )
+    parser.add_argument(
+        "--batch_size",
+        "-b",
+        type=str,
+        default=1,
+        metavar="auto|auto:N|N",
+        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximal batch size to try with --batch_size auto.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=None,
+        help="Device to use (e.g. cuda, cuda:0, cpu).",
+    )
+    parser.add_argument(
+        "--output_path",
+        "-o",
+        default=None,
+        type=str,
+        metavar="DIR|DIR/file.json",
+        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
+    )
+    parser.add_argument(
+        "--limit",
+        "-L",
+        type=float,
+        default=None,
+        metavar="N|0<N<1",
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
+    parser.add_argument(
+        "--use_cache",
+        "-c",
+        type=str,
+        default=None,
+        metavar="DIR",
+        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
+    )
+    parser.add_argument("--decontamination_ngrams_path", default=None)  # TODO: not used
+    parser.add_argument(
+        "--check_integrity",
+        action="store_true",
+        help="Whether to run the relevant part of the test suite for the tasks.",
+    )
+    parser.add_argument(
+        "--write_out",
+        "-w",
+        action="store_true",
+        default=False,
+        help="Prints the prompt for the first few documents.",
+    )
+    parser.add_argument(
+        "--log_samples",
+        "-s",
+        action="store_true",
+        default=False,
+        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
+    )
+    parser.add_argument(
+        "--show_config",
+        action="store_true",
+        default=False,
+        help="If True, shows the the full config of all tasks at the end of the evaluation.",
+    )
+    parser.add_argument(
+        "--include_path",
+        type=str,
+        default=None,
+        metavar="DIR",
+        help="Additional path to include if there are external tasks to include.",
+    )
+    parser.add_argument(
+        "--gen_kwargs",
+        default=None,
+        help=(
+            "String arguments for model generation on greedy_until tasks,"
+            " e.g. `temperature=0,top_k=0,top_p=0`."
+        ),
+    )
+    parser.add_argument(
+        "--verbosity",
+        "-v",
+        type=str.upper,
+        default="INFO",
+        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
+        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
+    )
+    return parser.parse_args()
+
+
+def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
+    if not args:
+        # we allow for args to be passed externally, else we parse them ourselves
+        args = parse_eval_args()
+
+    eval_logger = utils.eval_logger
+    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    eval_logger.info(f"Verbosity set to {args.verbosity}")
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+    initialize_tasks(args.verbosity)
+
+    if args.limit:
+        eval_logger.warning(
+            " --limit SHOULD ONLY BE USED FOR TESTING."
+            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
+        include_path(args.include_path)
+
+    if args.tasks is None:
+        task_names = ALL_TASKS
+    elif args.tasks == "list":
+        eval_logger.info(
+            "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
+        )
+        sys.exit()
+    else:
+        if os.path.isdir(args.tasks):
+            import glob
+
+            task_names = []
+            yaml_path = os.path.join(args.tasks, "*.yaml")
+            for yaml_file in glob.glob(yaml_path):
+                config = utils.load_yaml_config(yaml_file)
+                task_names.append(config)
+        else:
+            tasks_list = args.tasks.split(",")
+            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
+            for task in [task for task in tasks_list if task not in task_names]:
+                if os.path.isfile(task):
+                    config = utils.load_yaml_config(task)
+                    task_names.append(config)
+            task_missing = [
+                task
+                for task in tasks_list
+                if task not in task_names and "*" not in task
+            ]  # we don't want errors if a wildcard ("*") task name was used
+
+            if task_missing:
+                missing = ", ".join(task_missing)
+                eval_logger.error(
+                    f"Tasks were not found: {missing}\n"
+                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
+                )
+                raise ValueError(
+                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
+                )
+
+    if args.output_path:
+        path = Path(args.output_path)
+        # check if file or 'dir/results.json' exists
+        if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
+            eval_logger.warning(
+                f"File already exists at {path}. Results will be overwritten."
+            )
+            output_path_file = path.joinpath("results.json")
+            assert not path.is_file(), "File already exists"
+        # if path json then get parent dir
+        elif path.suffix in (".json", ".jsonl"):
+            output_path_file = path
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path = path.parent
+        else:
+            path.mkdir(parents=True, exist_ok=True)
+            output_path_file = path.joinpath("results.json")
+    elif args.log_samples and not args.output_path:
+        assert args.output_path, "Specify --output_path"
+
+    eval_logger.info(f"Selected Tasks: {task_names}")
+    print(f"type of model args: {type(args.model_args)}")
+    print("*************************************")
+    results = evaluator.simple_evaluate(
+        model=args.model,
+        model_args=args.model_args,
+        tasks=task_names,
+        num_fewshot=args.num_fewshot,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        device=args.device,
+        use_cache=args.use_cache,
+        limit=args.limit,
+        decontamination_ngrams_path=args.decontamination_ngrams_path,
+        check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        log_samples=args.log_samples,
+        gen_kwargs=args.gen_kwargs,
+    )
+
+    if results is not None:
+        if args.log_samples:
+            samples = results.pop("samples")
+        dumped = json.dumps(
+            results, indent=2, default=_handle_non_serializable, ensure_ascii=False
+        )
+        if args.show_config:
+            print(dumped)
+
+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
+
+        if args.output_path:
+            output_path_file.open("w").write(dumped)
+
+            if args.log_samples:
+                for task_name, config in results["configs"].items():
+                    output_name = "{}_{}".format(
+                        re.sub("/|=", "__", args.model_args), task_name
+                    )
+                    filename = path.joinpath(f"{output_name}.jsonl")
+                    samples_dumped = json.dumps(
+                        samples[task_name],
+                        indent=2,
+                        default=_handle_non_serializable,
+                        ensure_ascii=False,
+                    )
+                    filename.write_text(samples_dumped, encoding="utf-8")
+
+        print(
+            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
+        )
+        print(make_table(results))
+        if "groups" in results:
+            print(make_table(results, "groups"))
+
+
+if __name__ == "__main__":
+    cli_evaluate()
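
This file closely mirrors the harness's own CLI entry point. A hedged example (not from the commit) of its directory-of-YAMLs branch: when --tasks points at a directory, every *.yaml in it is loaded as a task config. This assumes open_llm_eval_prep.sh has already substituted {$EVAL_PATH} in those files.

    python eval/eval_main.py \
        --model hf \
        --model_args pretrained=EleutherAI/pythia-160m \
        --tasks eval/open_llm_leaderboard \
        --batch_size auto \
        --output_path eval_results/results.json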

+ 83 - 0
eval/evaluate_lm.py

@@ -0,0 +1,83 @@
+import fire
+from lm_eval.base import LM
+from lm_eval import tasks, evaluator
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+class HuggingFaceModel(LM):
+    def __init__(self, model_name, tokenizer_name, device=None):
+        self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self._device)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        # Use the model's context length when exposed, otherwise a common default.
+        self._max_seq_length = getattr(self.model.config, "max_position_embeddings", 2048)
+
+    def loglikelihood(self, ctx, cont):
+        # Encode context and continuation separately so the context tokens can be
+        # masked out of the loss.
+        ctx_ids = self.tokenizer.encode(ctx, add_special_tokens=False)
+        cont_ids = self.tokenizer.encode(cont, add_special_tokens=False)
+
+        input_ids = torch.tensor([ctx_ids + cont_ids], device=self._device)
+        # Ignore context positions (-100) so the loss covers only the continuation.
+        labels = input_ids.clone()
+        labels[:, : len(ctx_ids)] = -100
+
+        # loss is the mean NLL over the continuation tokens; multiply back to get the sum.
+        with torch.no_grad():
+            outputs = self.model(input_ids=input_ids, labels=labels)
+            log_likelihood = -outputs.loss.item() * len(cont_ids)
+
+        return log_likelihood, len(cont_ids)
+    
+    @property
+    def eot_token_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        return self._max_seq_length
+
+    @property
+    def max_gen_toks(self):
+        return 50
+
+    @property
+    def batch_size(self):
+        return 1
+
+    @property
+    def device(self):
+        return self._device
+
+    def tok_encode(self, string: str):
+        # The eval harness expects a plain list of token ids; rely on the tokenizer's
+        # default special-token handling (BOS but no EOS for typical causal-LM tokenizers).
+        return self.tokenizer.encode(string)
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
+
+    def _model_call(self, inputs):
+        # Forward pass only; the harness consumes the returned logits.
+        with torch.no_grad():
+            return self.model(inputs).logits
+    
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Generation is not needed for the loglikelihood-style tasks targeted here.
+        raise NotImplementedError("generation is not implemented for this wrapper")
+    # Implement other required methods if needed
+
+def evaluate_model(model_name, tokenizer_name, task_list):
+    # Instantiate the model
+    model = HuggingFaceModel(model_name, tokenizer_name)
+
+    # Convert task_list string to list
+    task_list = task_list.split(',')
+
+    # Evaluate
+    results = evaluator.evaluate(lm=model, tasks=tasks.get_task_dict(task_list), provide_description=True)
+    print(results)
+
+if __name__ == "__main__":
+    fire.Fire(evaluate_model)
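
An invocation sketch (not from the commit): fire.Fire exposes evaluate_model's parameters as command-line flags. The model, tokenizer, and task names here are only examples, and note that this file targets the older lm_eval.base API rather than the one used by eval.py and eval_main.py.

    python eval/evaluate_lm.py \
        --model_name EleutherAI/pythia-160m \
        --tokenizer_name EleutherAI/pythia-160m \
        --task_list hellaswag,winogrande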

+ 22 - 0
eval/open_llm_eval_prep.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Prompt the user for the EVAL_PATH
+read -p "Enter the asbolute path to the lm-evaluation-harness: " EVAL_PATH
+# Directory containing YAML files
+DIR="open_llm_leaderboard"
+
+# Check if the directory exists
+if [ ! -d "$DIR" ]; then
+    echo "Error: Directory '$DIR' not found."
+    exit 1
+fi
+
+# Iterate over YAML files in the directory and update them
+for YAML_FILE in "$DIR"/*.yaml
+do
+    if [ -f "$YAML_FILE" ]; then
+        sed -i 's|{\$EVAL_PATH}|'"$EVAL_PATH"'|g' "$YAML_FILE"
+        echo "Updated $YAML_FILE with EVAL_PATH: $EVAL_PATH"
+    fi
+done
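
Example run (a sketch, with a hypothetical path): the script rewrites every {$EVAL_PATH} placeholder in the open_llm_leaderboard YAMLs to point at a local clone of lm-evaluation-harness, after which eval.py can include them.

    cd eval
    bash open_llm_eval_prep.sh
    # When prompted, enter the absolute path to the clone, e.g.:
    #   /home/user/lm-evaluation-harness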

+ 6 - 0
eval/open_llm_leaderboard/arc_challeneg_25shots.yaml

@@ -0,0 +1,6 @@
+include: {$EVAL_PATH}/lm_eval/tasks/arc/arc_challenge.yaml
+task: arc_challenge_25_shot
+task_alias: arc 25 shot
+num_fewshot: 25
+metric_list:
+  - metric: acc_norm

+ 6 - 0
eval/open_llm_leaderboard/hellaswag_10shots.yaml

@@ -0,0 +1,6 @@
+include: {$EVAL_PATH}/lm_eval/tasks/hellaswag/hellaswag.yaml
+task: hellaswag_10_shot
+task_alias: hellaswag 10 shot
+num_fewshot: 10
+metric_list:
+  - metric: acc_norm

+ 24 - 0
eval/open_llm_leaderboard/hellaswag_utils.py

@@ -0,0 +1,24 @@
+import datasets
+import re
+
+
+def preprocess(text):
+    text = text.strip()
+    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
+    text = text.replace(" [title]", ". ")
+    text = re.sub("\\[.*?\\]", "", text)
+    text = text.replace("  ", " ")
+    return text
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc):
+        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
+        out_doc = {
+            "query": preprocess(doc["activity_label"] + ": " + ctx),
+            "choices": [preprocess(ending) for ending in doc["endings"]],
+            "gold": int(doc["label"]),
+        }
+        return out_doc
+
+    return dataset.map(_process_doc)

+ 6 - 0
eval/open_llm_leaderboard/winogrande_5shots.yaml

@@ -0,0 +1,6 @@
+include: {$EVAL_PATH}/lm_eval/tasks/winogrande/default.yaml
+task: winogrande_5_shot
+task_alias: winogrande 5 shot
+num_fewshot: 5
+metric_list:
+  - metric: acc