1 年之前 · 3012a230bf
--- a/eval/boolq_new.yaml
+++ b/eval/boolq_new.yaml
@@ -1,13 +0,0 @@
 
				-task: demo_boolq_5
			
 
				-dataset_path: super_glue
			
 
				-dataset_name: boolq
			
 
				-output_type: multiple_choice
			
 
				-training_split: train
			
 
				-validation_split: validation
			
 
				-doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
			
 
				-doc_to_target: label
			
 
				-doc_to_choice: ["no", "yes"]
			
 
				-should_decontaminate: true
			
 
				-doc_to_decontamination_query: passage
			
 
				-metric_list:
			
 
				-  - metric: acc
			
--- a/eval/eval_main.py
+++ b/eval/eval_main.py
@@ -1,287 +0,0 @@
 
				-import argparse
			
 
				-import json
			
 
				-import logging
			
 
				-import os
			
 
				-import re
			
 
				-import sys
			
 
				-from pathlib import Path
			
 
				-from typing import Union
			
 
				-
			
 
				-import numpy as np
			
 
				-
			
 
				-from lm_eval import evaluator, utils
			
 
				-from lm_eval.api.registry import ALL_TASKS
			
 
				-from lm_eval.tasks import include_path, initialize_tasks
			
 
				-from lm_eval.utils import make_table
			
 
				-
			
 
				-
			
 
				-def _handle_non_serializable(o):
			
 
				-    if isinstance(o, np.int64) or isinstance(o, np.int32):
			
 
				-        return int(o)
			
 
				-    elif isinstance(o, set):
			
 
				-        return list(o)
			
 
				-    else:
			
 
				-        return str(o)
			
 
				-
			
 
				-
			
 
				-def parse_eval_args() -> argparse.Namespace:
			
 
				-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
			
 
				-    parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
			
 
				-    parser.add_argument(
			
 
				-        "--tasks",
			
 
				-        "-t",
			
 
				-        default=None,
			
 
				-        metavar="task1,task2",
			
 
				-        help="To get full list of tasks, use the command lm-eval --tasks list",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--model_args",
			
 
				-        "-a",
			
 
				-        default="",
			
 
				-        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--num_fewshot",
			
 
				-        "-f",
			
 
				-        type=int,
			
 
				-        default=None,
			
 
				-        metavar="N",
			
 
				-        help="Number of examples in few-shot context",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--batch_size",
			
 
				-        "-b",
			
 
				-        type=str,
			
 
				-        default=1,
			
 
				-        metavar="auto|auto:N|N",
			
 
				-        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--max_batch_size",
			
 
				-        type=int,
			
 
				-        default=None,
			
 
				-        metavar="N",
			
 
				-        help="Maximal batch size to try with --batch_size auto.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--device",
			
 
				-        type=str,
			
 
				-        default=None,
			
 
				-        help="Device to use (e.g. cuda, cuda:0, cpu).",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--output_path",
			
 
				-        "-o",
			
 
				-        default=None,
			
 
				-        type=str,
			
 
				-        metavar="DIR|DIR/file.json",
			
 
				-        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--limit",
			
 
				-        "-L",
			
 
				-        type=float,
			
 
				-        default=None,
			
 
				-        metavar="N|0<N<1",
			
 
				-        help="Limit the number of examples per task. "
			
 
				-        "If <1, limit is a percentage of the total number of examples.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--use_cache",
			
 
				-        "-c",
			
 
				-        type=str,
			
 
				-        default=None,
			
 
				-        metavar="DIR",
			
 
				-        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
			
 
				-    )
			
 
				-    parser.add_argument("--decontamination_ngrams_path", default=None)  # TODO: not used
			
 
				-    parser.add_argument(
			
 
				-        "--check_integrity",
			
 
				-        action="store_true",
			
 
				-        help="Whether to run the relevant part of the test suite for the tasks.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--write_out",
			
 
				-        "-w",
			
 
				-        action="store_true",
			
 
				-        default=False,
			
 
				-        help="Prints the prompt for the first few documents.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--log_samples",
			
 
				-        "-s",
			
 
				-        action="store_true",
			
 
				-        default=False,
			
 
				-        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--show_config",
			
 
				-        action="store_true",
			
 
				-        default=False,
			
 
				-        help="If True, shows the the full config of all tasks at the end of the evaluation.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--include_path",
			
 
				-        type=str,
			
 
				-        default=None,
			
 
				-        metavar="DIR",
			
 
				-        help="Additional path to include if there are external tasks to include.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--gen_kwargs",
			
 
				-        default=None,
			
 
				-        help=(
			
 
				-            "String arguments for model generation on greedy_until tasks,"
			
 
				-            " e.g. `temperature=0,top_k=0,top_p=0`."
			
 
				-        ),
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--verbosity",
			
 
				-        "-v",
			
 
				-        type=str.upper,
			
 
				-        default="INFO",
			
 
				-        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
			
 
				-        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
			
 
				-    )
			
 
				-    return parser.parse_args()
			
 
				-
			
 
				-
			
 
				-def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
			
 
				-    if not args:
			
 
				-        # we allow for args to be passed externally, else we parse them ourselves
			
 
				-        args = parse_eval_args()
			
 
				-
			
 
				-    eval_logger = utils.eval_logger
			
 
				-    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
			
 
				-    eval_logger.info(f"Verbosity set to {args.verbosity}")
			
 
				-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
			
 
				-
			
 
				-    initialize_tasks(args.verbosity)
			
 
				-
			
 
				-    if args.limit:
			
 
				-        eval_logger.warning(
			
 
				-            " --limit SHOULD ONLY BE USED FOR TESTING."
			
 
				-            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
			
 
				-        )
			
 
				-    if args.include_path is not None:
			
 
				-        eval_logger.info(f"Including path: {args.include_path}")
			
 
				-        include_path(args.include_path)
			
 
				-
			
 
				-    if args.tasks is None:
			
 
				-        task_names = ALL_TASKS
			
 
				-    elif args.tasks == "list":
			
 
				-        eval_logger.info(
			
 
				-            "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
			
 
				-        )
			
 
				-        sys.exit()
			
 
				-    else:
			
 
				-        if os.path.isdir(args.tasks):
			
 
				-            import glob
			
 
				-
			
 
				-            task_names = []
			
 
				-            yaml_path = os.path.join(args.tasks, "*.yaml")
			
 
				-            for yaml_file in glob.glob(yaml_path):
			
 
				-                config = utils.load_yaml_config(yaml_file)
			
 
				-                task_names.append(config)
			
 
				-        else:
			
 
				-            tasks_list = args.tasks.split(",")
			
 
				-            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
			
 
				-            for task in [task for task in tasks_list if task not in task_names]:
			
 
				-                if os.path.isfile(task):
			
 
				-                    config = utils.load_yaml_config(task)
			
 
				-                    task_names.append(config)
			
 
				-            task_missing = [
			
 
				-                task
			
 
				-                for task in tasks_list
			
 
				-                if task not in task_names and "*" not in task
			
 
				-            ]  # we don't want errors if a wildcard ("*") task name was used
			
 
				-
			
 
				-            if task_missing:
			
 
				-                missing = ", ".join(task_missing)
			
 
				-                eval_logger.error(
			
 
				-                    f"Tasks were not found: {missing}\n"
			
 
				-                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
			
 
				-                )
			
 
				-                raise ValueError(
			
 
				-                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
			
 
				-                )
			
 
				-
			
 
				-    if args.output_path:
			
 
				-        path = Path(args.output_path)
			
 
				-        # check if file or 'dir/results.json' exists
			
 
				-        if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
			
 
				-            eval_logger.warning(
			
 
				-                f"File already exists at {path}. Results will be overwritten."
			
 
				-            )
			
 
				-            output_path_file = path.joinpath("results.json")
			
 
				-            assert not path.is_file(), "File already exists"
			
 
				-        # if path json then get parent dir
			
 
				-        elif path.suffix in (".json", ".jsonl"):
			
 
				-            output_path_file = path
			
 
				-            path.parent.mkdir(parents=True, exist_ok=True)
			
 
				-            path = path.parent
			
 
				-        else:
			
 
				-            path.mkdir(parents=True, exist_ok=True)
			
 
				-            output_path_file = path.joinpath("results.json")
			
 
				-    elif args.log_samples and not args.output_path:
			
 
				-        assert args.output_path, "Specify --output_path"
			
 
				-
			
 
				-    eval_logger.info(f"Selected Tasks: {task_names}")
			
 
				-    print(f"type of model args: {type(args.model_args)}")
			
 
				-    print("*************************************")
			
 
				-    results = evaluator.simple_evaluate(
			
 
				-        model=args.model,
			
 
				-        model_args=args.model_args,
			
 
				-        tasks=task_names,
			
 
				-        num_fewshot=args.num_fewshot,
			
 
				-        batch_size=args.batch_size,
			
 
				-        max_batch_size=args.max_batch_size,
			
 
				-        device=args.device,
			
 
				-        use_cache=args.use_cache,
			
 
				-        limit=args.limit,
			
 
				-        decontamination_ngrams_path=args.decontamination_ngrams_path,
			
 
				-        check_integrity=args.check_integrity,
			
 
				-        write_out=args.write_out,
			
 
				-        log_samples=args.log_samples,
			
 
				-        gen_kwargs=args.gen_kwargs,
			
 
				-    )
			
 
				-
			
 
				-    if results is not None:
			
 
				-        if args.log_samples:
			
 
				-            samples = results.pop("samples")
			
 
				-        dumped = json.dumps(
			
 
				-            results, indent=2, default=_handle_non_serializable, ensure_ascii=False
			
 
				-        )
			
 
				-        if args.show_config:
			
 
				-            print(dumped)
			
 
				-
			
 
				-        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
			
 
				-
			
 
				-        if args.output_path:
			
 
				-            output_path_file.open("w").write(dumped)
			
 
				-
			
 
				-            if args.log_samples:
			
 
				-                for task_name, config in results["configs"].items():
			
 
				-                    output_name = "{}_{}".format(
			
 
				-                        re.sub("/|=", "__", args.model_args), task_name
			
 
				-                    )
			
 
				-                    filename = path.joinpath(f"{output_name}.jsonl")
			
 
				-                    samples_dumped = json.dumps(
			
 
				-                        samples[task_name],
			
 
				-                        indent=2,
			
 
				-                        default=_handle_non_serializable,
			
 
				-                        ensure_ascii=False,
			
 
				-                    )
			
 
				-                    filename.write_text(samples_dumped, encoding="utf-8")
			
 
				-
			
 
				-        print(
			
 
				-            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
			
 
				-            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
			
 
				-        )
			
 
				-        print(make_table(results))
			
 
				-        if "groups" in results:
			
 
				-            print(make_table(results, "groups"))
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    cli_evaluate()
			
--- a/eval/evaluate_lm.py
+++ b/eval/evaluate_lm.py
@@ -1,83 +0,0 @@
 
				-import fire
			
 
				-from lm_eval.base import LM
			
 
				-from lm_eval import tasks, evaluator
			
 
				-from transformers import AutoModelForCausalLM, AutoTokenizer
			
 
				-import torch
			
 
				-
			
 
				-class HuggingFaceModel(LM):
			
 
				-    def __init__(self, model_name, tokenizer_name):
			
 
				-        self.model = AutoModelForCausalLM.from_pretrained(model_name)
			
 
				-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
			
 
				-
			
 
				-    def loglikelihood(self, ctx, cont):
			
 
				-        # Encode context and continuation
			
 
				-        input_ids = self.tokenizer.encode(ctx, add_special_tokens=False)
			
 
				-        cont_ids = self.tokenizer.encode(cont, add_special_tokens=False)
			
 
				-
			
 
				-        # Concatenate context and continuation
			
 
				-        input_ids += cont_ids
			
 
				-
			
 
				-        # Calculate log likelihood
			
 
				-        with torch.no_grad():
			
 
				-            outputs = self.model(input_ids=torch.tensor([input_ids]), labels=torch.tensor([input_ids]))
			
 
				-            log_likelihood = -outputs.loss.item() * len(cont_ids)
			
 
				-
			
 
				-        return log_likelihood, len(cont_ids)
			
 
				-    
			
 
				-    @property
			
 
				-    def eot_token_id(self):
			
 
				-        return self._tokenizer.eos_id()
			
 
				-
			
 
				-    @property
			
 
				-    def max_length(self):
			
 
				-        return self._max_seq_length
			
 
				-
			
 
				-    @property
			
 
				-    def max_gen_toks(self):
			
 
				-        return 50
			
 
				-
			
 
				-    @property
			
 
				-    def batch_size(self):
			
 
				-        return 1
			
 
				-
			
 
				-    @property
			
 
				-    def device(self):
			
 
				-        return self._device
			
 
				-
			
 
				-    def tok_encode(self, string: str):
			
 
				-        encoded = encode_tokens(self._tokenizer,
			
 
				-            string, bos=True, eos=False, device=self._device)
			
 
				-        # encoded is a pytorch tensor, but some internal logic in the
			
 
				-        # eval harness expects it to be a list instead
			
 
				-        # TODO: verify this for multi-batch as well
			
 
				-        encoded = encoded.tolist()
			
 
				-        return encoded
			
 
				-
			
 
				-    def tok_decode(self, tokens):
			
 
				-        decoded = self._tokenizer.decode(tokens)
			
 
				-        return decoded
			
 
				-
			
 
				-    def _model_call(self, inputss):
			
 
				-
			
 
				-        max_new_tokens = 1
			
 
				-       
			
 
				-        logits = model(inputs)
			
 
				-        return logits
			
 
				-    
			
 
				-    def _model_generate(self, context, max_length, eos_token_id):
			
 
				-        raise Exception('unimplemented')
			
 
				-    # Implement other required methods if needed
			
 
				-
			
 
				-def evaluate_model(model_name, tokenizer_name, task_list):
			
 
				-    # Instantiate the model
			
 
				-    model = HuggingFaceModel(model_name, tokenizer_name)
			
 
				-
			
 
				-    # Convert task_list string to list
			
 
				-    task_list = task_list.split(',')
			
 
				-
			
 
				-    # Evaluate
			
 
				-    results = evaluator.evaluate(lm=model, tasks=tasks.get_task_dict(task_list), provide_description=True)
			
 
				-    print(results)
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    fire.Fire(evaluate_model)