原始代码

2025-12-05 07:11:25 +00:00
parent 045e777a11
commit dd5339de32
46 changed files with 5848 additions and 0 deletions
--- a/optimization/__pycache__/opt_utils.cpython-310.pyc
+++ b/optimization/__pycache__/opt_utils.cpython-310.pyc
--- a/optimization/opt_utils.py
+++ b/optimization/opt_utils.py
--- a/optimization/optimize_instructions.py
+++ b/optimization/optimize_instructions.py
@@ -0,0 +1,736 @@
+
+import datetime
+import functools
+import os
+import sys
+
+# Directory three levels above this file's resolved path. It is put at the
+# front of sys.path before the `opro` imports below are executed.
+OPRO_ROOT_PATH = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+)
+sys.path.insert(0, OPRO_ROOT_PATH)
+
+from absl import app
+from absl import flags
+import google.generativeai as palm
+import numpy as np
+import openai
+from opro import prompt_utils
+from opro.optimization import opt_utils
+import pandas as pd
+# Global absl flag registry; main() reads `local_model_path` through it.
+FLAGS = flags.FLAGS
+# Datasets are looked up under "<OPRO_ROOT_PATH>/data".
+ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data")
+
+# Unlike the flags below, this one has no module-level holder, so it is only
+# reachable as FLAGS.local_model_path.
+flags.DEFINE_string("local_model_path", "", "Path to local vLLM model.")
+
+# API keys; both default to the empty string.
+_OPENAI_API_KEY = flags.DEFINE_string(
+    "openai_api_key", "", "The OpenAI API key."
+)
+
+_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
+
+# Names of the scorer and optimizer LLMs; main() validates the allowed values.
+_SCORER = flags.DEFINE_string(
+    "scorer", "text-bison", "The name of the scorer LLM."
+)
+
+_OPTIMIZER = flags.DEFINE_string(
+    "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
+)
+
+# Dataset and task selection; main() lower-cases the dataset name before
+# validating it.
+_DATASET = flags.DEFINE_string(
+    "dataset", "gsm8k", "The name of dataset to search for instructions on."
+)
+
+_TASK = flags.DEFINE_string(
+    "task",
+    "train",
+    "The name of task within the above dataset to search for instructions on.",
+)
+
+# Where the searched instruction is placed; main() accepts before_Q, Q_begin,
+# Q_end and A_begin.
+_INSTRUCTION_POS = flags.DEFINE_string(
+    "instruction_pos",
+    "A_begin",
+    "The position of the instruction to search for.",
+)
+
+# Meta-prompt layout; main() accepts both_instructions_and_exemplars and
+# instructions_only.
+_META_PROMPT_TYPE = flags.DEFINE_string(
+    "meta_prompt_type",
+    "both_instructions_and_exemplars",
+    "The type of meta-prompt: whether to have both previous instructions and"
+    " dataset exemplars (often for fine-tuned optimizers), or to have only"
+    " previous instructions (often for pre-trained optimizers).",
+)
+
+
+def main(_):
+  local_model_path = FLAGS.local_model_path
+  openai_api_key = _OPENAI_API_KEY.value
+  palm_api_key = _PALM_API_KEY.value
+  scorer_llm_name = _SCORER.value
+  optimizer_llm_name = _OPTIMIZER.value
+  dataset_name = _DATASET.value.lower()
+  task_name = _TASK.value
+  meta_prompt_type = _META_PROMPT_TYPE.value
+
+  assert dataset_name in {
+      "mmlu",
+      "bbh",
+      "gsm8k",
+  }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k."
+  if dataset_name == "mmlu":
+    assert task_name in {
+        "STEM",
+        "humanities",
+        "social sciences",
+        "other (business, health, misc.)",
+    }  # for now only support searching on one MMLU category
+  elif dataset_name == "bbh":
+    assert task_name in {
+        "boolean_expressions",
+        "causal_judgement",
+        "date_understanding",
+        "disambiguation_qa",
+        "dyck_languages",
+        "formal_fallacies",
+        "geometric_shapes",
+        "hyperbaton",
+        "logical_deduction_five_objects",
+        "logical_deduction_seven_objects",
+        "logical_deduction_three_objects",
+        "movie_recommendation",
+        "multistep_arithmetic_two",
+        "navigate",
+        "object_counting",
+        "penguins_in_a_table",
+        "reasoning_about_colored_objects",
+        "ruin_names",
+        "salient_translation_error_detection",
+        "snarks",
+        "sports_understanding",
+        "temporal_sequences",
+        "tracking_shuffled_objects_five_objects",
+        "tracking_shuffled_objects_seven_objects",
+        "tracking_shuffled_objects_three_objects",
+        "web_of_lies",
+        "word_sorting",
+    }
+  else:
+    assert dataset_name == "gsm8k"
+    assert task_name in {"train", "test"}
+
+  assert scorer_llm_name in {
+      "text-bison",
+      "gpt-3.5-turbo",
+      "gpt-4",
+       "local",
+  }
+  assert optimizer_llm_name in {
+      "text-bison",
+      "gpt-3.5-turbo",
+      "gpt-4",
+       "local",
+  }
+  assert meta_prompt_type in {
+      "both_instructions_and_exemplars",
+      "instructions_only",
+  }
+
+  instruction_pos = _INSTRUCTION_POS.value
+  assert instruction_pos in {
+      "before_Q",
+      "Q_begin",
+      "Q_end",
+      "A_begin",
+  }, (
+      "The instruction position should be either before the question, or at the"
+      " beginning of the question, at the end of the question, or at the"
+      " beginning of the answer."
+  )
+  print(
+      f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:"
+      f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}"
+  )
+
+
+  if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+      assert openai_api_key, "The OpenAI API key must be provided."
+      openai.api_key = openai_api_key
+  elif scorer_llm_name == "text-bison":
+      assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model."
+      palm.configure(api_key=palm_api_key)
+  elif scorer_llm_name == "local":
+      # 本地模型，无需 API key
+      pass
+  else:
+      raise ValueError(f"Unknown scorer model: {scorer_llm_name}")
+
+  if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+      assert openai_api_key, "The OpenAI API key must be provided."
+      openai.api_key = openai_api_key
+  elif optimizer_llm_name == "text-bison":
+      assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model."
+      palm.configure(api_key=palm_api_key)
+  elif optimizer_llm_name == "local":
+      # 本地模型，无需 API key
+      pass
+  else:
+      raise ValueError(f"Unknown scorer model: {optimizer_llm_name}")
+
+
+  if dataset_name == "mmlu":
+    root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data")
+  elif dataset_name == "bbh":
+    root_data_folder_path = os.path.join(
+        ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/"
+    )
+  else:
+    assert dataset_name == "gsm8k"
+    root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data")
+
+  # =================== create the result directory ==========================
+  datetime_str = (
+      str(datetime.datetime.now().replace(microsecond=0))
+      .replace(" ", "-")
+      .replace(":", "-")
+  )
+
+
+  save_folder = os.path.join(
+    OPRO_ROOT_PATH,
+    "outputs",
+    "optimization-results",
+    f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/",
+)
+
+
+  result_by_instruction_folder = os.path.join(
+      save_folder, "result_by_instruction"
+  )
+  print(f"Results will be saved to: {os.path.abspath(result_by_instruction_folder)}")
+  os.makedirs(result_by_instruction_folder,exist_ok=True)
+  print(f"result directory:\n{save_folder}")
+
+  # ====================== scorer model configs ==============================
+
+
+  if scorer_llm_name == "text-bison":
+    # when prompting text-bison with Cloud API
+    scorer_finetuned_palm_temperature = 0.0
+    scorer_finetuned_palm_max_decode_steps = 1024
+    scorer_finetuned_palm_batch_size = 1
+    scorer_finetuned_palm_num_servers = 1
+    scorer_finetuned_palm_dict = dict()
+    scorer_finetuned_palm_dict["temperature"] = (
+        scorer_finetuned_palm_temperature
+    )
+    scorer_finetuned_palm_dict["num_servers"] = (
+        scorer_finetuned_palm_num_servers
+    )
+    scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size
+    scorer_finetuned_palm_dict["max_decode_steps"] = (
+        scorer_finetuned_palm_max_decode_steps
+    )
+
+    call_scorer_finetuned_palm_server_func = functools.partial(
+        prompt_utils.call_palm_server_from_cloud,
+        model="text-bison-001",
+        temperature=scorer_finetuned_palm_dict["temperature"],
+        max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"],
+    )
+
+    scorer_llm_dict = {
+        "model_type": scorer_llm_name.lower(),
+    }
+    scorer_llm_dict.update(scorer_finetuned_palm_dict)
+    call_scorer_server_func = call_scorer_finetuned_palm_server_func
+
+ 
+  elif scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
+    # 改成调用本地vLLM版本的函数
+    scorer_gpt_max_decode_steps = 1024
+    # scorer_gpt_max_decode_steps = 512
+    scorer_gpt_temperature = 0.0
+
+    scorer_llm_dict = {
+        "model_type": scorer_llm_name.lower(),
+        "max_decode_steps": scorer_gpt_max_decode_steps,
+        "temperature": scorer_gpt_temperature,
+        "num_decodes": 1,
+        "batch_size": 1,
+        "num_servers": 1,
+    }
+
+    call_scorer_server_func = functools.partial(
+        prompt_utils.call_openai_server_func,  # 你本地实现的vLLM调用函数
+        max_decode_steps=scorer_gpt_max_decode_steps,
+        temperature=scorer_gpt_temperature,
+        local_model_path=FLAGS.local_model_path,  # 传入你本地模型路径
+    )
+
+  else:
+      raise ValueError(f"Unsupported scorer_llm_name: {scorer_llm_name}")
+
+
+  # ====================== optimizer model configs ============================
+  if optimizer_llm_name.lower() == "text-bison":
+    # when prompting text-bison with Cloud API
+    optimizer_finetuned_palm_temperature = 1.0
+    optimizer_finetuned_palm_num_decodes = 8
+    optimizer_finetuned_palm_max_decode_steps = 1024
+    optimizer_finetuned_palm_batch_size = 1
+    optimizer_finetuned_palm_num_servers = 1
+    optimizer_finetuned_palm_dict = dict()
+    optimizer_finetuned_palm_dict["temperature"] = (
+        optimizer_finetuned_palm_temperature
+    )
+    optimizer_finetuned_palm_dict["num_decodes"] = (
+        optimizer_finetuned_palm_num_decodes
+    )
+    optimizer_finetuned_palm_dict["batch_size"] = (
+        optimizer_finetuned_palm_batch_size
+    )
+    optimizer_finetuned_palm_dict["num_servers"] = (
+        optimizer_finetuned_palm_num_servers
+    )
+    optimizer_finetuned_palm_dict["max_decode_steps"] = (
+        optimizer_finetuned_palm_max_decode_steps
+    )
+
+    call_optimizer_finetuned_palm_server_func = functools.partial(
+        prompt_utils.call_palm_server_from_cloud,
+        model="text-bison-001",
+        temperature=optimizer_finetuned_palm_dict["temperature"],
+        max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
+    )
+
+    optimizer_llm_dict = {
+        "model_type": optimizer_llm_name.lower(),
+    }
+    optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
+    call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
+  
+  elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
+    # 用本地 vLLM 版本替代调用
+    optimizer_gpt_max_decode_steps = 512
+
+    optimizer_gpt_temperature = 1.0
+
+    optimizer_llm_dict = {
+        "max_decode_steps": optimizer_gpt_max_decode_steps,
+        "temperature": optimizer_gpt_temperature,
+        "batch_size": 1,
+        "num_decodes": 1,
+    }
+
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_openai_server_func,  # 你写的本地vLLM调用接口
+        max_decode_steps=optimizer_gpt_max_decode_steps,
+        temperature=optimizer_gpt_temperature,
+        local_model_path=FLAGS.local_model_path,
+    )
+
+  else:
+      raise ValueError(f"Unsupported optimizer_llm_name: {optimizer_llm_name}")
+
+  
+
+  # ====================== try calling the servers ============================
+  print("\n======== testing the scorer and optimizer servers ===========")
+  scorer_test_output = call_scorer_server_func(
+      "Does the sun rise from the north? Just answer yes or no."
+  )
+  print(f"number of scorer output decodes: {len(scorer_test_output)}")
+  print(f"scorer test output: {scorer_test_output}")
+  optimizer_test_output = call_optimizer_server_func(
+      "Does the sun rise from the north? Just answer yes or no.",
+      temperature=1.0,
+  )
+  print(f"number of optimizer output decodes: {len(optimizer_test_output)}")
+  print(f"optimizer test output: {optimizer_test_output}")
+  print("Finished testing the servers.")
+
+  # ====================== read data ============================
+  print("\n================ prompt optimization settings ==============")
+  # from https://github.com/hendrycks/test/blob/master/categories.py
+  subcategories = {
+      "abstract_algebra": ["math"],
+      "anatomy": ["health"],
+      "astronomy": ["physics"],
+      "business_ethics": ["business"],
+      "clinical_knowledge": ["health"],
+      "college_biology": ["biology"],
+      "college_chemistry": ["chemistry"],
+      "college_computer_science": ["computer science"],
+      "college_mathematics": ["math"],
+      "college_medicine": ["health"],
+      "college_physics": ["physics"],
+      "computer_security": ["computer science"],
+      "conceptual_physics": ["physics"],
+      "econometrics": ["economics"],
+      "electrical_engineering": ["engineering"],
+      "elementary_mathematics": ["math"],
+      "formal_logic": ["philosophy"],
+      "global_facts": ["other"],
+      "high_school_biology": ["biology"],
+      "high_school_chemistry": ["chemistry"],
+      "high_school_computer_science": ["computer science"],
+      "high_school_european_history": ["history"],
+      "high_school_geography": ["geography"],
+      "high_school_government_and_politics": ["politics"],
+      "high_school_macroeconomics": ["economics"],
+      "high_school_mathematics": ["math"],
+      "high_school_microeconomics": ["economics"],
+      "high_school_physics": ["physics"],
+      "high_school_psychology": ["psychology"],
+      "high_school_statistics": ["math"],
+      "high_school_us_history": ["history"],
+      "high_school_world_history": ["history"],
+      "human_aging": ["health"],
+      "human_sexuality": ["culture"],
+      "international_law": ["law"],
+      "jurisprudence": ["law"],
+      "logical_fallacies": ["philosophy"],
+      "machine_learning": ["computer science"],
+      "management": ["business"],
+      "marketing": ["business"],
+      "medical_genetics": ["health"],
+      "miscellaneous": ["other"],
+      "moral_disputes": ["philosophy"],
+      "moral_scenarios": ["philosophy"],
+      "nutrition": ["health"],
+      "philosophy": ["philosophy"],
+      "prehistory": ["history"],
+      "professional_accounting": ["other"],
+      "professional_law": ["law"],
+      "professional_medicine": ["health"],
+      "professional_psychology": ["psychology"],
+      "public_relations": ["politics"],
+      "security_studies": ["politics"],
+      "sociology": ["culture"],
+      "us_foreign_policy": ["politics"],
+      "virology": ["health"],
+      "world_religions": ["philosophy"],
+  }
+
+  categories = {
+      "STEM": [
+          "physics",
+          "chemistry",
+          "biology",
+          "computer science",
+          "math",
+          "engineering",
+      ],
+      "humanities": ["history", "philosophy", "law"],
+      "social sciences": [
+          "politics",
+          "culture",
+          "economics",
+          "geography",
+          "psychology",
+      ],
+      "other (business, health, misc.)": ["other", "business", "health"],
+  }
+
+  if dataset_name == "mmlu":
+
+    category_names = [task_name]
+    folder_name = "test"  # one of {'auxiliary_train', 'dev', 'val', 'test'}
+    task_names = []
+    for task_csv_name in os.listdir(
+        os.path.join(root_data_folder_path, folder_name)
+    ):
+      task_names.append(task_csv_name.split(".")[0])
+
+    tasks_in_category = []
+    for category_name in category_names:
+      for task_name in task_names:
+        for subname in subcategories:
+          if subname in task_name:
+            if subcategories[subname][0] in categories[category_name]:
+              tasks_in_category.append(task_name)
+              break
+
+    tasks_all = [(folder_name, task_name) for task_name in tasks_in_category]
+    multiple_choice_tasks = set([item[1] for item in tasks_all])
+    boolean_tasks = set()
+    numerical_output_tasks = set()
+
+    
+  elif dataset_name == "bbh":
+    tasks_all = [task_name]
+    assert (
+        len(tasks_all) == 1
+    ), "for now only support prompt optimization on one BBH task"
+
+    
+    numerical_output_tasks = {
+        "object_counting",
+        "multistep_arithmetic_two",
+    }
+
+    multiple_choice_tasks = {
+        "date_understanding",
+        "disambiguation_qa",
+        "geometric_shapes",
+        "hyperbaton",
+        "logical_deduction_five_objects",
+        "logical_deduction_seven_objects",
+        "logical_deduction_three_objects",
+        "movie_recommendation",
+        "penguins_in_a_table",
+        "reasoning_about_colored_objects",
+        "ruin_names",
+        "salient_translation_error_detection",
+        "snarks",
+        "temporal_sequences",
+        "tracking_shuffled_objects_five_objects",
+        "tracking_shuffled_objects_seven_objects",
+        "tracking_shuffled_objects_three_objects",
+    }
+
+    boolean_tasks = {
+        "boolean_expressions",  # True or False
+        "causal_judgement",  # yes or no
+        "formal_fallacies",  # valid or invalid
+        "navigate",  # yes or no
+        "sports_understanding",  # yes or no
+        "web_of_lies",  # yes or no
+    }
+
+  else:
+    assert dataset_name in {"gsm8k"}
+    tasks_all = [task_name]
+    multiple_choice_tasks = set()
+    boolean_tasks = set()
+    numerical_output_tasks = set(tasks_all)
+
+  if dataset_name == "mmlu":
+    raw_data = pd.DataFrame()
+    prediction_treat_as_number = False
+    prediction_treat_as_bool = False
+  elif dataset_name == "bbh":
+    raw_data = []
+    prediction_treat_as_number = bool(
+        tasks_all[0] in numerical_output_tasks
+    )  # for now only check the first task
+    prediction_treat_as_bool = bool(
+        tasks_all[0] in boolean_tasks
+    )  # for now only check the first task
+    print(
+        f"prediction_treat_as_number: {prediction_treat_as_number},"
+        f" prediction_treat_as_bool: {prediction_treat_as_bool}"
+    )
+  else:
+    assert dataset_name == "gsm8k"
+    raw_data = pd.DataFrame()
+    prediction_treat_as_number = True
+    prediction_treat_as_bool = False
+
+  for t in tasks_all:
+    if dataset_name == "mmlu":
+      folder_name = t[0]
+      task_name = t[1]
+      single_task_df = pd.read_csv(
+          os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"),
+          index_col=None,
+          header=None,
+      )
+      raw_data = pd.concat([raw_data, single_task_df])
+    elif dataset_name == "bbh":
+      task_name = t
+      single_task_list = opt_utils.load_bbh_task_data(
+          task_name, base_dir=root_data_folder_path
+      )
+      raw_data += single_task_list
+    else:
+      assert dataset_name == "gsm8k"
+      task_name = t
+      f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv")
+      single_task_df = pd.read_csv(f_gsm, sep="\t", header=None)
+      raw_data = pd.concat([raw_data, single_task_df])
+
+  if dataset_name == "mmlu":
+    num_examples = raw_data.shape[0]
+  elif dataset_name == "bbh":
+    num_examples = len(raw_data)
+  else:
+    assert dataset_name in {"gsm8k"}
+    num_examples = raw_data.shape[0]
+  print(f"number of examples in the current task: {num_examples}")
+
+  # ================ split data into train/val/test ==========================
+  if dataset_name == "mmlu":
+    train_ratio = 0.8
+    
+    eval_ratio = 0.2
+  elif dataset_name == "gsm8k":
+#    train_ratio = 0.035
+    train_ratio = 0.01               # 原来是 0.035，改成 0.01，约 74 条
+    eval_ratio = 0
+  else:
+    assert dataset_name == "bbh"
+    train_ratio = 0.2
+    eval_ratio = 0
+
+ 
+  assert train_ratio + eval_ratio <= 1
+  test_ratio = 1 - train_ratio - eval_ratio
+  print(
+      f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, "
+      f"test_ratio: {test_ratio}"
+  )
+  np.random.seed(0)
+  train_index = np.sort(
+      np.array(
+          np.random.choice(
+              num_examples, size=int(train_ratio * num_examples), replace=False
+          )
+      )
+  )
+  eval_and_test_index = np.sort(
+      np.array(list(set(np.arange(num_examples)) - set(train_index)))
+  )
+  eval_index = np.sort(
+      np.array(
+          np.random.choice(
+              eval_and_test_index,
+              size=int(eval_ratio * num_examples),
+              replace=False,
+          )
+      )
+  )
+
+  # ========== set other optimization experiment hyperparameters ==============
+  if scorer_llm_name == "text-bison":
+    old_instruction_score_threshold = 0.0
+    # old_instruction_score_threshold = 0.15  # for GSM8K
+  else:
+    assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"}
+    old_instruction_score_threshold = 0.3
+
+  if scorer_llm_name == "text-bison":
+    extract_final_answer_by_prompting_again = False
+    include_qa = False
+    evaluate_in_parallel = False
+  else:
+    assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"}
+    extract_final_answer_by_prompting_again = False
+    include_qa = False
+    evaluate_in_parallel = False
+
+  optimizer_llm_temperature = optimizer_llm_dict["temperature"]
+
+
+#  num_few_shot_questions_for_instruction_refinement = 3
+  num_few_shot_questions_for_instruction_refinement = 1  # 减少 few-shot 例子数
+
+#  num_generated_instructions_in_each_step = 8
+  num_generated_instructions_in_each_step = 2  # 每步只生成 2 条指令
+
+#  num_search_steps = 200
+  num_search_steps = 3            # 原来是 200，改成 3 步即可
+
+
+  initial_instructions = [
+      "Let's solve the problem.",
+      # "",
+      # "The answer is",
+  ]
+  few_shot_qa_pairs = True
+  # one of {'accumulative_most_frequent', 'current_most_frequent', 'random',
+  # 'constant'}
+  few_shot_selection_criteria = "random"
+  # whether to evaluate generated instructions on the exemplars in meta-prompt
+  evaluate_generated_ins_on_few_shot = False
+  # whether to evaluate old instructions on the exemplars in the meta-prompt
+  evaluate_old_ins_on_few_shot = False
+  # every this number of steps, compute the accuracies of current-step
+  # instructions on the validation set
+#  eval_interval = 3
+  eval_interval = 1               # 每步就 eval 一次，及时看到结果
+  # eval_interval = 10
+  max_num_instructions = (
+      20  # the maximum number of instructions and scores in the meta-prompt
+  )
+  # The number of buckets when converting scores to integers in the meta-prompt.
+  num_score_buckets = 100
+  # whether to put old instructions and scores to before exemplars in
+  # the meta-prompt
+  meta_prompt_instructions_before_exemplars = True
+
+  # ===================== run prompt optimization ======================
+
+  assert few_shot_selection_criteria in {
+      "accumulative_most_frequent",
+      "current_most_frequent",
+      "random",
+      "constant",
+  }
+  evolution_kwargs = {
+      "num_search_steps": num_search_steps,
+      "old_instruction_score_threshold": old_instruction_score_threshold,
+      "scorer_llm_dict": scorer_llm_dict,
+      "optimizer_llm_dict": optimizer_llm_dict,
+      "extract_final_answer_by_prompting_again": (
+          extract_final_answer_by_prompting_again
+      ),
+      "include_qa": include_qa,
+      "evaluate_in_parallel": evaluate_in_parallel,
+      "tasks_all": tasks_all,
+      "train_ratio": train_ratio,
+      "eval_ratio": eval_ratio,
+      "test_ratio": test_ratio,
+      "train_index": train_index,
+      "eval_index": eval_index,
+      "dataset_name": dataset_name,
+      "task_name": task_name,
+      "num_examples": num_examples,
+      "root_data_folder_path": root_data_folder_path,
+      "optimizer_llm_temperature": optimizer_llm_temperature,
+      # "optimizer_llm_temperature_schedule": (
+      #     optimizer_llm_temperature_schedule
+      # ),
+      # "optimizer_llm_temperature_end": optimizer_llm_temperature_end,
+      "initial_instructions": initial_instructions,
+      "multiple_choice_tasks": multiple_choice_tasks,
+      "raw_data": raw_data,
+      "call_scorer_server_func": call_scorer_server_func,
+      "call_optimizer_server_func": call_optimizer_server_func,
+      "instruction_pos": instruction_pos,
+      "prediction_treat_as_number": prediction_treat_as_number,
+      "prediction_treat_as_bool": prediction_treat_as_bool,
+      "result_by_instruction_folder": result_by_instruction_folder,
+      "few_shot_qa_pairs": few_shot_qa_pairs,
+      "num_score_buckets": num_score_buckets,
+      "max_num_instructions": max_num_instructions,
+      "meta_prompt_type": meta_prompt_type,
+      "meta_prompt_instructions_before_exemplars": (
+          meta_prompt_instructions_before_exemplars
+      ),
+      "few_shot_selection_criteria": few_shot_selection_criteria,
+      "optimizer_llm_name": optimizer_llm_name,
+      "num_generated_instructions_in_each_step": (
+          num_generated_instructions_in_each_step
+      ),
+      "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot,
+      "num_few_shot_questions_for_instruction_refinement": (
+          num_few_shot_questions_for_instruction_refinement
+      ),
+      "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot,
+      "eval_interval": eval_interval,
+      "save_folder": save_folder,
+  }
+  print("=== 开始优化过程 ===")
+  try:
+    opt_utils.run_evolution(**evolution_kwargs)
+    print("=== 优化完成 ===")
+  except Exception as e:
+    import traceback
+    print(f"!!! 优化失败: {e} !!!", file=sys.stderr)
+    traceback.print_exc()
+    sys.exit(1)
+
+
+# Script entry point: when executed directly, hand `main` to absl's app.run.
+if __name__ == "__main__":
+  app.run(main)
+
--- a/optimization/optimize_linear_regression.py
+++ b/optimization/optimize_linear_regression.py
@@ -0,0 +1,424 @@
+# Copyright 2023 The OPRO Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Optimize over the objective function of a linear regression problem.
+
+Usage:
+
+```
+python optimize_linear_regression.py --optimizer="text-bison"
+```
+
+Note:
+- When using a Google-Cloud-served model (like text-bison at
+https://developers.generativeai.google/tutorials/text_quickstart), add
+`--palm_api_key="<your_key>"`
+- When using an OpenAI model, add `--openai_api_key="<your_key>"`
+"""
+
+import datetime
+import functools
+import json
+import os
+import re
+import sys
+
+# Directory three levels above this file's resolved path. It is put at the
+# front of sys.path before the `opro` import below is executed.
+OPRO_ROOT_PATH = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+)
+sys.path.insert(0, OPRO_ROOT_PATH)
+
+from absl import app
+from absl import flags
+import google.generativeai as palm
+import numpy as np
+import openai
+
+from opro import prompt_utils
+
+# API keys; both default to the empty string. main() asserts that the key
+# matching the chosen optimizer is non-empty.
+_OPENAI_API_KEY = flags.DEFINE_string(
+    "openai_api_key", "", "The OpenAI API key."
+)
+
+_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
+
+# Optimizer LLM name; main() accepts text-bison, gpt-3.5-turbo and gpt-4.
+_OPTIMIZER = flags.DEFINE_string(
+    "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
+)
+
+
+def main(_):
+  # ============== set optimization experiment configurations ================
+  num_points = 50  # number of points in linear regression
+  w_true = 15  # the true w
+  b_true = 14  # the true b
+  max_num_steps = 500  # the number of optimization steps
+  num_reps = 5  # the number of repeated runs
+  max_num_pairs = 20  # the maximum number of input-output pairs in meta-prompt
+  num_input_decimals = 0  # num of decimals for input values in meta-prompt
+  num_output_decimals = 0  # num of decimals for output values in meta-prompt
+  num_generated_points_in_each_step = 8
+
+  # ================ load LLM settings ===================
+  optimizer_llm_name = _OPTIMIZER.value
+  assert optimizer_llm_name in {
+      "text-bison",
+      "gpt-3.5-turbo",
+      "gpt-4",
+  }
+  openai_api_key = _OPENAI_API_KEY.value
+  palm_api_key = _PALM_API_KEY.value
+
+  if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+    assert openai_api_key, "The OpenAI API key must be provided."
+    openai.api_key = openai_api_key
+  else:
+    assert optimizer_llm_name == "text-bison"
+    assert (
+        palm_api_key
+    ), "A PaLM API key is needed when prompting the text-bison model."
+    palm.configure(api_key=palm_api_key)
+
+  # =================== create the result directory ==========================
+  datetime_str = (
+      str(datetime.datetime.now().replace(microsecond=0))
+      .replace(" ", "-")
+      .replace(":", "-")
+  )
+
+  save_folder = os.path.join(
+      OPRO_ROOT_PATH,
+      "outputs",
+      "optimization-results",
+      f"linear_regression-o-{optimizer_llm_name}-{datetime_str}/",
+  )
+  os.makedirs(save_folder)
+  print(f"result directory:\n{save_folder}")
+
+  # ====================== optimizer model configs ============================
+  if optimizer_llm_name.lower() == "text-bison":
+    # when prompting text-bison with Cloud API
+    optimizer_finetuned_palm_temperature = 1.0
+    optimizer_finetuned_palm_max_decode_steps = 1024
+    optimizer_finetuned_palm_batch_size = 1
+    optimizer_finetuned_palm_num_servers = 1
+    optimizer_finetuned_palm_dict = dict()
+    optimizer_finetuned_palm_dict["temperature"] = (
+        optimizer_finetuned_palm_temperature
+    )
+    optimizer_finetuned_palm_dict["batch_size"] = (
+        optimizer_finetuned_palm_batch_size
+    )
+    optimizer_finetuned_palm_dict["num_servers"] = (
+        optimizer_finetuned_palm_num_servers
+    )
+    optimizer_finetuned_palm_dict["max_decode_steps"] = (
+        optimizer_finetuned_palm_max_decode_steps
+    )
+
+    call_optimizer_finetuned_palm_server_func = functools.partial(
+        prompt_utils.call_palm_server_from_cloud,
+        # prompt_utils.call_vllm,
+        model="text-bison-001",
+        temperature=optimizer_finetuned_palm_dict["temperature"],
+        max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
+    )
+
+    optimizer_llm_dict = {
+        "model_type": optimizer_llm_name.lower(),
+    }
+    optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
+    call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
+
+  else:
+    assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
+    optimizer_gpt_max_decode_steps = 1024
+    optimizer_gpt_temperature = 1.0
+
+    optimizer_llm_dict = dict()
+    optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
+    optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
+    optimizer_llm_dict["batch_size"] = 1
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_openai_server_func,
+        model=optimizer_llm_name,
+        max_decode_steps=optimizer_gpt_max_decode_steps,
+        temperature=optimizer_gpt_temperature,
+    )
+
+  # ====================== try calling the servers ============================
+  print("\n======== testing the optimizer server ===========")
+  optimizer_test_output = call_optimizer_server_func(
+      "Does the sun rise from the north? Just answer yes or no.",
+      temperature=1.0,
+  )
+  print(f"optimizer test output: {optimizer_test_output}")
+  print("Finished testing the optimizer server.")
+  print("\n=================================================")
+
+  # ====================== utility functions ============================
+  def evaluate_loss(X, y, w, b):  # pylint: disable=invalid-name
+    residual = y - (X * w + b)
+    return np.linalg.norm(residual) ** 2
+
+  def gen_meta_prompt(
+      old_value_pairs_set,
+      X,  # pylint: disable=invalid-name, unused-argument
+      y,  # pylint: disable=unused-argument
+      num_input_decimals=5,
+      num_output_decimals=5,
+      max_num_pairs=100,
+  ):
+    """Build the meta-prompt that asks the optimizer for a new (w, b) pair.
+
+    Every (w, b, z) triple is rounded first, triples that become identical
+    after rounding collapse into one entry, and the triples are listed from the
+    highest z down to the lowest, keeping only the last max_num_pairs of them.
+
+    Args:
+     old_value_pairs_set (set): the (w, b, z) triples seen so far.
+     X (np.array): the 1D array of x values; unused by the active prompt text.
+     y (np.array): the 1D array of y values; unused by the active prompt text.
+     num_input_decimals (int): the number of decimals kept for w and b; when it
+       is not positive, w and b are converted with int() instead.
+     num_output_decimals (int): the number of decimals kept for z; when it is
+       not positive, z is converted with int() instead.
+     max_num_pairs (int): the maximum number of exemplars in the meta-prompt.
+
+    Returns:
+      meta_prompt (str): the generated meta-prompt.
+    """
+
+    def _quantize(value, num_decimals):
+      # Round to the requested decimals, or truncate when none are requested.
+      return np.round(value, num_decimals) if num_decimals > 0 else int(value)
+
+    quantized_triples = {
+        (
+            _quantize(w, num_input_decimals),
+            _quantize(b, num_input_decimals),
+            _quantize(z, num_output_decimals),
+        )
+        for w, b, z in old_value_pairs_set
+    }
+    # Descending by z; the tail of that ordering holds the lowest values.
+    ranked_triples = sorted(quantized_triples, key=lambda triple: -triple[2])
+    shown_triples = ranked_triples[-max_num_pairs:]
+    exemplars = "".join(
+        f"\ninput:\nw={w}, b={b}\nvalue:\n{z}\n" for w, b, z in shown_triples
+    )
+    task_description = """
+  Now you will help me minimize a function with two input variables w, b. I have some (w, b) pairs and the function values at those points. The pairs are arranged in descending order based on their function values, where lower values are better.
+    """.strip()
+    request = """Give me a new (w, b) pair that is different from all pairs above, and has a function value lower than any of the above. Do not write code. The output must end with a pair [w, b], where w and b are numerical values.
+    """.strip()
+    meta_prompt = task_description + "\n\n" + exemplars.strip() + "\n\n"
+    # function_analytic_form = ""
+    # for xi, yi in zip(X, y):
+    #   function_analytic_form += f"({yi:.4f} - ({xi:.4f} * w + b)) ** 2 + "
+    # function_analytic_form = function_analytic_form[:-3]
+    # meta_prompt += (
+    #     "The function has the analytic form f(w, b) ="
+    #     f" {function_analytic_form}. When evaluating the value of a (w, b)"
+    #     " pair, you should replace the w and b in the analytic form with your"
+    #     " values and do the computation."
+    # )
+    # meta_prompt += "\n\n"
+    meta_prompt += request
+    return meta_prompt
+
+  def extract_string_in_square_brackets(input_string):
+    """Return the contents of the last usable [...] group in input_string.
+
+    Bracket groups are scanned from the last to the first. A group that has no
+    "=" but contains the letter "w" or "b" (for example a literal "[w, b]") is
+    skipped. The first group that is not skipped is returned without its
+    surrounding brackets.
+
+    Args:
+      input_string (str): the raw text to search.
+
+    Returns:
+      The text inside the selected brackets, or "" when no group qualifies.
+    """
+    for candidate in reversed(re.findall(r"\[.*?\]", input_string)):
+      names_without_values = "=" not in candidate and (
+          "w" in candidate or "b" in candidate
+      )
+      if not names_without_values:
+        return candidate[1:-1]
+    return ""
+
+  def parse_output(extracted_output):
+    """Convert a comma-separated string such as '1.5, 2.5' to a float array.
+
+    Items may be written as 'name=value'; only the text after the first '=' is
+    kept. Items that are not numeric make the float conversion fail, which the
+    calling loop handles by catching ValueError.
+
+    Args:
+      extracted_output (str): the extracted output string, like '1.5, 2.5'.
+
+    Returns:
+      parsed_output (np.array): the parsed values as floats, like [1.5, 2.5],
+      or None when extracted_output is empty.
+    """
+    if not extracted_output:
+      return None
+    value_strings = [
+        (piece.split("=", 1)[1] if "=" in piece else piece).strip()
+        for piece in extracted_output.split(",")
+    ]
+    return np.array(value_strings).astype(float)
+
+  configs_dict = dict()
+  results_dict = dict()
+  num_convergence_steps = []
+  for i_rep in range(num_reps):
+    found_optimal = False
+    print(f"\nRep {i_rep}:")
+
+    # ================= generate the ground truth X, y =====================
+    X = np.arange(num_points).astype(float) + 1  # pylint: disable=invalid-name
+    np.random.seed(i_rep + 1)
+    y = X * w_true + b_true + np.random.randn(num_points)
+    loss_at_true_values = evaluate_loss(X, y, w_true, b_true)
+    print(f"value at (w_true, b_true): {loss_at_true_values}")
+
+    # ================= generate the starting points =====================
+    num_starting_points = 5  # the number of initial points for optimization
+    np.random.seed((i_rep + 1) * 10)
+    init_w = np.random.uniform(low=10, high=20, size=num_starting_points)
+    np.random.seed((i_rep + 1) * 100)
+    init_b = np.random.uniform(low=10, high=20, size=num_starting_points)
+
+    # ====================== run optimization ============================
+    configs_dict_single_rep = {
+        "optimizer_llm_configs": optimizer_llm_dict,
+        "data": {
+            "num_points": num_points,
+            "w_true": w_true,
+            "b_true": b_true,
+            "loss_at_true_values": loss_at_true_values,
+            "X": list(X),
+            "y": list(y),
+        },
+        "init_w": list(init_w),
+        "init_b": list(init_b),
+        "max_num_steps": max_num_steps,
+        "max_num_pairs": max_num_pairs,
+        "num_input_decimals": num_input_decimals,
+        "num_output_decimals": num_output_decimals,
+        "num_generated_points_in_each_step": num_generated_points_in_each_step,
+    }
+    configs_dict[i_rep] = configs_dict_single_rep
+    configs_json_path = os.path.join(save_folder, "configs.json")
+    print(f"saving configs to\n{configs_json_path}")
+    with open(configs_json_path, "w") as f:
+      json.dump(configs_dict, f, indent=4)
+
+    old_value_pairs_set = set()
+    old_value_pairs_with_i_step = []  # format: [(w, b, z = f(w, b), i_step)]
+    meta_prompts_dict = dict()  # format: {i_step: meta_prompt}
+    raw_outputs_dict = dict()  # format: {i_step: raw_outputs}
+
+    rounded_inits = [
+        (np.round(w, num_input_decimals), np.round(b, num_input_decimals))
+        for w, b in zip(init_w, init_b)
+    ]
+    rounded_inits = [
+        tuple(item) for item in list(np.unique(rounded_inits, axis=0))
+    ]
+    for w, b in rounded_inits:
+      z = evaluate_loss(X, y, w, b)
+      old_value_pairs_set.add((w, b, z))
+      old_value_pairs_with_i_step.append((w, b, z, -1))
+
+    print("\n================ run optimization ==============")
+    print(
+        f"initial points: {[tuple(item[:2]) for item in old_value_pairs_set]}"
+    )
+    print(f"initial values: {[item[-1] for item in old_value_pairs_set]}")
+    results_json_path = os.path.join(save_folder, "results.json")
+    print(f"saving results to\n{results_json_path}")
+
+    for i_step in range(max_num_steps):
+      print(f"\nStep {i_step}:")
+      meta_prompt = gen_meta_prompt(
+          old_value_pairs_set,
+          X,
+          y,
+          num_input_decimals=num_input_decimals,
+          num_output_decimals=num_output_decimals,
+          max_num_pairs=max_num_pairs,
+      )
+      if not i_step % 5:
+        print("\n=================================================")
+        print(f"meta_prompt:\n{meta_prompt}")
+      meta_prompts_dict[i_step] = meta_prompt
+
+      # generate a maximum of the given number of points in each step
+      remaining_num_points_to_generate = num_generated_points_in_each_step
+      raw_outputs = []
+      while remaining_num_points_to_generate > 0:
+        raw_outputs += call_optimizer_server_func(meta_prompt)
+        remaining_num_points_to_generate -= optimizer_llm_dict["batch_size"]
+      raw_outputs = raw_outputs[:num_generated_points_in_each_step]
+
+      raw_outputs_dict[i_step] = raw_outputs
+      parsed_outputs = []
+      for string in raw_outputs:
+        if not i_step % 5:
+          print("\n=================================================")
+          print("raw output:\n", string)
+          print("\n=================================================")
+        try:
+          parsed_output = parse_output(
+              extract_string_in_square_brackets(string)
+          )
+          if parsed_output is not None and len(parsed_output) == 2:
+            parsed_outputs.append(parsed_output)
+        except ValueError:
+          pass
+      parsed_outputs = [tuple(item) for item in parsed_outputs]
+      print(f"proposed points before rounding: {parsed_outputs}")
+
+      # round the proposed points to the number of decimals in meta-prompt
+      rounded_outputs = [
+          (np.round(w, num_input_decimals), np.round(b, num_input_decimals))
+          for w, b in parsed_outputs
+      ]
+      rounded_outputs = [
+          tuple(item) for item in list(np.unique(rounded_outputs, axis=0))
+      ]
+      print(f"proposed points after rounding: {rounded_outputs}")
+
+      # evaluate the values of proposed and rounded outputs
+      single_step_values = []
+      for w, b in rounded_outputs:
+        if w == w_true and b == b_true:
+          found_optimal = True
+        z = evaluate_loss(X, y, w, b)
+        single_step_values.append(z)
+        old_value_pairs_set.add((w, b, z))
+        old_value_pairs_with_i_step.append((w, b, z, i_step))
+      print(f"single_step_values: {single_step_values}")
+
+      # ====================== save results ============================
+      results_dict_single_rep = {
+          "meta_prompts": meta_prompts_dict,
+          "raw_outputs": raw_outputs_dict,
+          "old_value_pairs_with_i_step": old_value_pairs_with_i_step,
+      }
+      results_dict[i_rep] = results_dict_single_rep
+      with open(results_json_path, "w") as f:
+        json.dump(results_dict, f, indent=4)
+      if found_optimal:
+        print(
+            f"Repetition {i_rep+1}, optimal found at Step {i_step+1}, saving"
+            f" final results to\n{save_folder}"
+        )
+        num_convergence_steps.append(i_step + 1)
+        break
+  print(f"num_convergence_steps: {num_convergence_steps}")
+
+
+# Entry point when the file is executed as a script: run main via app.run.
+if __name__ == "__main__":
+  app.run(main)
--- a/optimization/optimize_tsp.py
+++ b/optimization/optimize_tsp.py
@@ -0,0 +1,430 @@
+# Copyright 2024 The OPRO Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Optimize over the objective function of a traveling salesman problem.
+
+Usage:
+
+```
+python optimize_tsp.py --optimizer="text-bison"
+```
+
+Note:
+- When using a Google-Cloud-served model (like text-bison at
+https://developers.generativeai.google/tutorials/text_quickstart), add
+`--palm_api_key="<your_key>"`
+- When using an OpenAI model, add `--openai_api_key="<your_key>"`
+"""
+
+import datetime
+import functools
+import getpass
+import json
+import os
+import re
+import sys
+import itertools
+
+OPRO_ROOT_PATH = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+)
+sys.path.insert(0, OPRO_ROOT_PATH)
+
+from absl import app
+from absl import flags
+import google.generativeai as palm
+import numpy as np
+import openai
+
+from opro import prompt_utils
+
+
+# Command-line flags. main reads each one through its `.value` attribute.
+
+# main asserts this is non-empty when the optimizer is gpt-3.5-turbo or gpt-4.
+_OPENAI_API_KEY = flags.DEFINE_string(
+    "openai_api_key", "", "The OpenAI API key."
+)
+
+# main asserts this is non-empty when the optimizer is text-bison.
+_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
+
+# main asserts the value is one of text-bison, gpt-3.5-turbo or gpt-4.
+_OPTIMIZER = flags.DEFINE_string(
+    "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
+)
+
+# Passed to solve_tsp, which only tests for "nearest_neighbor" and
+# "farthest_insertion"; every other value (including "dp") reaches its
+# dynamic-programming branch.
+_START_ALGORITHM = flags.DEFINE_string(
+    "starting_algorithm", "farthest_insertion", "The name of the starting algorithm. Select from [dp, nearest_neighbor, farthest_insertion]"
+)
+
+def main(_):
+  """Optimize a randomly generated TSP instance by prompting an LLM."""
+  # ============== set optimization experiment configurations ================
+  num_points = 100  # number of points in TSP
+  num_steps = 500  # the number of optimization steps
+  max_num_pairs = 10  # the maximum number of input-output pairs in meta-prompt
+  num_decimals = 0  # num of decimals for distances in meta-prompt
+  num_starting_points = 5  # the number of initial points for optimization
+  num_decode_per_step = 8  # the number of decoded solutions per step
+
+  # ================ load LLM settings ===================
+  openai_model_names = {"gpt-3.5-turbo", "gpt-4"}
+  optimizer_llm_name = _OPTIMIZER.value
+  assert optimizer_llm_name in openai_model_names | {"text-bison"}
+  openai_api_key = _OPENAI_API_KEY.value
+  palm_api_key = _PALM_API_KEY.value
+
+  # Register the API key of whichever provider serves the optimizer.
+  if optimizer_llm_name in openai_model_names:
+    assert openai_api_key, "The OpenAI API key must be provided."
+    openai.api_key = openai_api_key
+  else:
+    assert optimizer_llm_name == "text-bison"
+    assert (
+        palm_api_key
+    ), "A PaLM API key is needed when prompting the text-bison model."
+    palm.configure(api_key=palm_api_key)
+
+  # =================== create the result directory ==========================
+  # Second-resolution timestamp with "-" as the only separator.
+  datetime_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+  save_folder = os.path.join(
+      OPRO_ROOT_PATH,
+      "outputs",
+      "optimization-results",
+      f"tsp-o-{optimizer_llm_name}-{datetime_str}/",
+  )
+  os.makedirs(save_folder)
+  print(f"result directory:\n{save_folder}")
+
+  # ====================== optimizer model configs ============================
+  if optimizer_llm_name.lower() == "text-bison":
+    # when prompting text-bison with Cloud API
+    optimizer_finetuned_palm_dict = {
+        "temperature": 1.0,
+        "batch_size": 1,
+        "num_servers": 1,
+        "max_decode_steps": 1024,
+    }
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_palm_server_from_cloud,
+        # prompt_utils.call_vllm,
+        model="text-bison-001",
+        temperature=optimizer_finetuned_palm_dict["temperature"],
+        max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
+    )
+    # The model type comes first, followed by the serving settings above.
+    optimizer_llm_dict = {
+        "model_type": optimizer_llm_name.lower(),
+        **optimizer_finetuned_palm_dict,
+    }
+  else:
+    assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
+    optimizer_llm_dict = {
+        "max_decode_steps": 1024,
+        "temperature": 1.0,
+        "batch_size": 1,
+    }
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_openai_server_func,
+        model=optimizer_llm_name,
+        max_decode_steps=optimizer_llm_dict["max_decode_steps"],
+        temperature=optimizer_llm_dict["temperature"],
+    )
+
+  # ====================== try calling the servers ============================
+  print("\n======== testing the optimizer server ===========")
+  optimizer_test_output = call_optimizer_server_func(
+      "Does the sun rise from the north? Just answer yes or no.",
+      temperature=1.0,
+  )
+  print(f"optimizer test output: {optimizer_test_output}")
+  print("Finished testing the optimizer server.")
+  print("\n=================================================")
+
+  # ====================== utility functions ============================
+  def evaluate_distance(x, y, trace, num_decimals):  # pylint: disable=invalid-name
+    """Return the length of the closed tour that visits `trace` in order.
+
+    The tour goes through the points in the given order and then returns from
+    the last point to the first one; edges are Euclidean distances.
+
+    Args:
+      x: the x coordinates, indexed by the point ids stored in `trace`.
+      y: the y coordinates, indexed by the point ids stored in `trace`.
+      trace: the sequence of point ids to visit.
+      num_decimals: the number of decimals kept in the result; when it is not
+        positive the length is truncated with int().
+
+    Returns:
+      The tour length, or -1 when `trace` is empty or holds an id that cannot
+      be used to index x and y.
+    """
+    num_stops = len(trace)
+    if num_stops == 0:
+      return -1
+    dis = 0
+    try:
+      for i in range(num_stops):
+        id0 = trace[i]
+        # Wrapping around makes the last iteration the closing edge, so that
+        # edge is guarded against bad ids like every other edge.
+        id1 = trace[(i + 1) % num_stops]
+        dis += np.sqrt((x[id0] - x[id1]) ** 2 + (y[id0] - y[id1]) ** 2)
+    except (IndexError, KeyError, TypeError):
+      # An id outside the coordinate range, or one that is not a valid index.
+      return -1
+    return np.round(dis, num_decimals) if num_decimals > 0 else int(dis)
+
+  def solve_tsp(x, y, num_points, num_decimals, starting_algorithm):
+    """Build a reference tour over points 0..num_points-1, starting at point 0.
+
+    Args:
+      x: the x coordinates, indexed by point id.
+      y: the y coordinates, indexed by point id.
+      num_points: the number of points in the instance.
+      num_decimals: the number of decimals kept in the returned length; when it
+        is not positive the length is truncated with int().
+      starting_algorithm: "nearest_neighbor" or "farthest_insertion" select the
+        matching heuristic; any other value runs the dynamic-programming
+        search at the end of the function.
+
+    Returns:
+      A (gt_sol, min_dis) tuple: the tour as a list of point ids and its
+      rounded (or truncated) length including the edge back to the start.
+    """
+    if starting_algorithm == "nearest_neighbor":
+      # Greedy: repeatedly move to the closest point that is still unvisited.
+      min_dis = 0
+      gt_sol = [0]
+      remaining_points = list(range(1, num_points))
+      while len(remaining_points) > 0:
+        # -1 marks "nothing selected yet" for this round.
+        min_p = -1
+        min_cur_dis = -1
+        for p in remaining_points:
+          cur_dis = np.sqrt((x[p] - x[gt_sol[-1]]) ** 2 + (y[p] - y[gt_sol[-1]]) ** 2)
+          # Strict comparison: among equally close points the first one wins.
+          if min_p == -1 or cur_dis < min_cur_dis:
+            min_p = p
+            min_cur_dis = cur_dis
+        gt_sol.append(min_p)
+        min_dis += min_cur_dis
+        remaining_points.remove(min_p)
+      # Close the tour by returning to point 0.
+      min_dis += np.sqrt((x[0] - x[gt_sol[-1]]) ** 2 + (y[0] - y[gt_sol[-1]]) ** 2)
+      min_dis = np.round(min_dis, num_decimals) if num_decimals > 0 else int(min_dis)
+      return gt_sol, min_dis
+    elif starting_algorithm == "farthest_insertion":
+      # Each round: for every remaining point find its cheapest insertion
+      # position, then insert the point whose cheapest resulting tour is the
+      # longest.
+      gt_sol = [0]
+      remaining_points = list(range(1, num_points))
+      while len(remaining_points) > 0:
+        max_p = -1
+        max_cur_dis = -1
+        max_cur_index = -1
+        for p in remaining_points:
+          min_cur_dis = -1
+          min_cur_index = -1
+          # Index 0 is never used, so point 0 stays at the front of the tour.
+          for index in range(1, len(gt_sol) + 1):
+            new_sol = gt_sol[:index] + [p] + gt_sol[index:]
+            # Candidate tours are compared on evaluate_distance's output, i.e.
+            # after it has applied num_decimals.
+            cur_dis = evaluate_distance(x, y, new_sol, num_decimals)
+            if min_cur_dis == -1 or cur_dis < min_cur_dis:
+              min_cur_dis = cur_dis
+              min_cur_index = index
+          if max_cur_dis == -1 or min_cur_dis > max_cur_dis:
+            max_p = p
+            max_cur_dis = min_cur_dis
+            max_cur_index = min_cur_index
+        gt_sol = gt_sol[:max_cur_index] + [max_p] + gt_sol[max_cur_index:]
+        remaining_points.remove(max_p)
+      min_dis = evaluate_distance(x, y, gt_sol, num_decimals)
+      return gt_sol, min_dis
+      
+    # Dynamic programming over (last point, visited bitmask) states.
+    # f maps a state to (distance travelled so far, path taken); the start
+    # state is "at point 0 with only bit 0 set".
+    # NOTE(review): the mask ranges over subsets of the points, so this branch
+    # is presumably only practical for small num_points -- confirm before use.
+    f = {(0, 1): (0, [0])}
+    # q is used as a FIFO queue of states that still have to be expanded.
+    q = [(0, 1)]
+    min_dis = -1
+    gt_sol = list(range(num_points))
+    while len(q) > 0:
+      p, status = q[0]
+      q = q[1:]
+      for i in range(num_points):
+        # `2 << i >> 1` is the bit of point i; the test reads
+        # ((bit of i) & status) == 0, i.e. point i has not been visited yet.
+        if 2 << i >> 1 & status == 0:
+          new_status = status + (2 << i >> 1)
+          new_dis = f[(p, status)][0] + np.sqrt((x[i] - x[p]) ** 2 + (y[i] - y[p]) ** 2)
+          if (i, new_status) not in f or new_dis < f[(i, new_status)][0]:
+            f[(i, new_status)] = (new_dis, f[(p, status)][1] + [i])
+            # All num_points bits set: every point has been visited.
+            if new_status == (2 << num_points >> 1) - 1:
+              # Add the edge back to point 0 before comparing full tours.
+              new_dis += np.sqrt((x[i] - x[0]) ** 2 + (y[i] - y[0]) ** 2)
+              if min_dis == -1 or new_dis < min_dis:
+                min_dis = new_dis
+                gt_sol = f[(i, new_status)][1][:]
+            elif (i, new_status) not in q:
+              q.append((i, new_status))
+    min_dis = np.round(min_dis, num_decimals) if num_decimals > 0 else int(min_dis)
+    return gt_sol, min_dis
+
+  def gen_meta_prompt(
+      old_value_pairs_set,
+      x,  # pylint: disable=invalid-name
+      y,
+      max_num_pairs=100,
+  ):
+    """Build the meta-prompt that asks the optimizer for a shorter trace.
+
+    The prompt lists every point with its coordinates, then the previous
+    traces from the longest to the shortest (keeping only the last
+    max_num_pairs of them), then the request for a new trace.
+
+    Args:
+     old_value_pairs_set (set): the (trace, length) pairs seen so far.
+     x: the x coordinate of every point, in point-id order.
+     y: the y coordinate of every point, in point-id order.
+     max_num_pairs (int): the maximum number of exemplars in the meta-prompt.
+
+    Returns:
+      meta_prompt (str): the generated meta-prompt.
+    """
+    # Descending by length; the tail of that ordering holds the shortest ones.
+    ranked_pairs = sorted(old_value_pairs_set, key=lambda pair: -pair[1])
+    shown_pairs = ranked_pairs[-max_num_pairs:]
+    exemplars = "".join(
+        f"\n<trace> {trace} </trace>\nlength:\n{dis}\n"
+        for trace, dis in shown_pairs
+    )
+    coordinates = ", ".join(
+        f"({i}): ({xi}, {yi})" for i, (xi, yi) in enumerate(zip(x, y))
+    )
+    introduction = (
+        "You are given a list of points with coordinates below:\n"
+        + coordinates
+        + ".\n\nBelow are some previous traces and their lengths. The traces are arranged in descending order based on their lengths, where lower values are better."
+    )
+    request = """Give me a new trace that is different from all traces above, and has a length lower than any of the above. The trace should traverse all points exactly once. The trace should start with '<trace>' and end with </trace>.
+    """.strip()
+    return "\n\n".join([introduction, exemplars.strip(), request])
+
+  def extract_string(input_string):
+    """Parse the integers listed inside the first <trace>...</trace> pair.
+
+    The text between the first "<trace>" and the first "</trace>" after it is
+    split on commas; items that are not integers are dropped.
+
+    Args:
+      input_string (str): the raw text to search.
+
+    Returns:
+      The list of parsed integers (possibly empty), or "" when the opening tag
+      is missing or no closing tag follows it.
+    """
+    start_string = "<trace>"
+    end_string = "</trace>"
+    _, start_found, remainder = input_string.partition(start_string)
+    if not start_found:
+      return ""
+    inner, end_found, _ = remainder.partition(end_string)
+    if not end_found:
+      return ""
+    parsed_list = []
+    for token in inner.split(","):
+      try:
+        parsed_list.append(int(token.strip()))
+      except ValueError:
+        # Not an integer: ignore this item and keep the rest of the trace.
+        continue
+    return parsed_list
+
+  # ================= generate the ground truth trace =====================
+  # Draw a random instance, with coordinates rounded (or truncated) in the
+  # same way as the distances shown in the meta-prompt.
+  x = np.random.uniform(low=-100, high=100, size=num_points)
+  y = np.random.uniform(low=-100, high=100, size=num_points)
+  x = [np.round(xi, num_decimals) if num_decimals > 0 else int(xi) for xi in x]
+  y = [np.round(yi, num_decimals) if num_decimals > 0 else int(yi) for yi in y]
+
+  starting_algorithm = _START_ALGORITHM.value
+
+  gt_sol, min_dis = solve_tsp(x, y, num_points, num_decimals, starting_algorithm)
+  print("ground truth solution" + str(gt_sol))
+  print("min distance: ", min_dis)
+  gt_sol_str = ",".join([str(i) for i in gt_sol])
+  point_list = range(num_points)
+  # Starting tours: random permutations that begin at point 0 and differ from
+  # the reference solution.
+  init_sols = []
+  while len(init_sols) < num_starting_points:
+    sol = np.random.permutation(point_list)
+    if sol[0] != 0:
+      continue
+    sol_str = ",".join([str(i) for i in sol])
+    if sol_str == gt_sol_str:
+      continue
+    init_sols.append(list(sol))
+
+  # ====================== run optimization ============================
+  configs_dict = {
+      "num_starting_points": num_starting_points,
+      "num_decode_per_step": num_decode_per_step,
+      "optimizer_llm_configs": optimizer_llm_dict,
+      "data": {
+          "ground truth solution": [gt_sol_str],
+          "loss_at_true_values": min_dis,
+          "x": list(x),
+          "y": list(y),
+      },
+      "init_sols": [",".join([str(i) for i in sol]) for sol in init_sols],
+      "num_steps": num_steps,
+      "max_num_pairs": max_num_pairs,
+      "num_decimals": num_decimals,
+  }
+  configs_json_path = os.path.join(save_folder, "configs.json")
+  print(f"saving configs to\n{configs_json_path}")
+  with open(configs_json_path, "w") as f:
+    json.dump(configs_dict, f, indent=4)
+
+  old_value_pairs_set = set()
+  old_value_pairs_with_i_step = []  # format: [(trace, dis = f(trace), i_step)]
+  meta_prompts_dict = dict()  # format: {i_step: meta_prompt}
+  raw_outputs_dict = dict()  # format: {i_step: raw_outputs}
+
+  for sol in init_sols:
+    dis = evaluate_distance(x, y, sol, num_decimals)
+    sol_str = ",".join([str(i) for i in sol])
+    old_value_pairs_set.add((sol_str, dis))
+    old_value_pairs_with_i_step.append((sol_str, dis, -1))
+
+  print("\n================ run optimization ==============")
+  print(f"initial points: {[tuple(item[:-1]) for item in old_value_pairs_set]}")
+  print(f"initial values: {[item[-1] for item in old_value_pairs_set]}")
+  results_json_path = os.path.join(save_folder, "results.json")
+  print(f"saving results to\n{results_json_path}")
+
+  # A proposal is only accepted when it is exactly a permutation of these ids.
+  expected_points = list(range(num_points))
+
+  for i_step in range(num_steps):
+    print(f"\nStep {i_step}:")
+    meta_prompt = gen_meta_prompt(
+        old_value_pairs_set,
+        x,
+        y,
+        max_num_pairs=max_num_pairs,
+    )
+    print("\n=================================================")
+    print(f"meta_prompt:\n{meta_prompt}")
+    meta_prompts_dict[i_step] = meta_prompt
+    raw_outputs = []
+    parsed_outputs = []
+    parsed_distances = []  # parsed_distances[k] is the length of parsed_outputs[k]
+    # NOTE(review): this loop only ends once num_decode_per_step valid tours
+    # have been parsed; there is no cap on the number of optimizer calls.
+    while len(parsed_outputs) < num_decode_per_step:
+      raw_output = call_optimizer_server_func(meta_prompt)
+      for string in raw_output:
+        print("\n=================================================")
+        print("raw output:\n", string)
+        try:
+          parsed_output = extract_string(string)
+          # Comparing against the sorted id list also rejects negative ids,
+          # which would otherwise index the coordinates from the end.
+          is_valid_tour = (
+              parsed_output is not None
+              and len(parsed_output) == num_points
+              and parsed_output[0] == 0
+              and sorted(parsed_output) == expected_points
+          )
+        except Exception:  # pylint: disable=broad-except
+          # Best effort: an output that cannot be parsed is skipped, while
+          # KeyboardInterrupt and SystemExit are no longer swallowed.
+          continue
+        if not is_valid_tour:
+          continue
+        dis = evaluate_distance(x, y, parsed_output, num_decimals)
+        if dis == -1:
+          continue
+        parsed_outputs.append(parsed_output)
+        parsed_distances.append(dis)
+        raw_outputs.append(string)
+    print("\n=================================================")
+    print(f"proposed points: {parsed_outputs}")
+    raw_outputs_dict[i_step] = raw_outputs
+
+    # record the values of the proposed traces (computed once, during parsing)
+    single_step_values = []
+    for trace, dis in zip(parsed_outputs, parsed_distances):
+      single_step_values.append(dis)
+      trace_str = ",".join([str(i) for i in trace])
+      old_value_pairs_set.add((trace_str, dis))
+      old_value_pairs_with_i_step.append((trace_str, dis, i_step))
+    print(f"single_step_values: {single_step_values}")
+    print("ground truth solution" + str(gt_sol))
+    print("min distance: ", min_dis)
+
+    # ====================== save results ============================
+    # The results file is rewritten after every step.
+    results_dict = {
+        "meta_prompts": meta_prompts_dict,
+        "raw_outputs": raw_outputs_dict,
+        "old_value_pairs_with_i_step": old_value_pairs_with_i_step,
+    }
+    with open(results_json_path, "w") as f:
+      json.dump(results_dict, f, indent=4)
+
+
+# Entry point when the file is executed as a script: run main via app.run.
+if __name__ == "__main__":
+  app.run(main)
--- a/optimization/test.py
+++ b/optimization/test.py
@@ -0,0 +1,967 @@
+# Copyright 2023 The OPRO Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""The .py file for prompt optimization.
+
+Usage:
+
+Step 1: edit the starting instructions by modifying `initial_instructions`
+
+Step 2: edit the training ratio by modifying `train_ratio`
+
+Step 3: check if the model configs (like batch size) are the same as the actual serving configs
+
+Step 4: run
+
+```
+python optimize_instructions.py \
+    --optimizer="gpt-3.5-turbo" --scorer="text-bison" \
+    --instruction_pos="A_begin" --dataset="gsm8k" --task="train"
+```
+
+The outputs will then be written to `outputs/optimization-results/` in the opro folder.
+
+Notes:
+
+1. One or more API keys may need to be provided:
+- When using a Google-Cloud-served model (like text-bison at https://developers.generativeai.google/tutorials/text_quickstart), add `--palm_api_key=<your_key>`
+- When using an OpenAI model, add `--openai_api_key="<your_key>"`
+
+2. The initial instructions should be provided in the "initial_instructions"
+variable.
+"""
+
+import datetime
+import functools
+import os
+import sys
+
+OPRO_ROOT_PATH = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+)
+sys.path.insert(0, OPRO_ROOT_PATH)
+
+from absl import app
+from absl import flags
+import google.generativeai as palm
+import numpy as np
+import openai
+from opro import prompt_utils
+from opro.optimization import opt_utils
+import pandas as pd
+
+ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data")
+
+# Command-line flags. main reads each one through its `.value` attribute.
+
+# main asserts this is set and exists on disk when the scorer or the optimizer
+# is "local".
+_LOCAL_MODEL_PATH = flags.DEFINE_string("local_model_path", None, "Path to local vLLM model.")
+
+
+# main asserts this is non-empty when the scorer or the optimizer is
+# gpt-3.5-turbo or gpt-4.
+_OPENAI_API_KEY = flags.DEFINE_string(
+    "openai_api_key", "", "The OpenAI API key."
+)
+
+# main asserts this is non-empty when the scorer or the optimizer is
+# text-bison.
+_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
+
+# main asserts the value is one of text-bison, gpt-3.5-turbo, gpt-4 or local.
+_SCORER = flags.DEFINE_string(
+    "scorer", "text-bison", "The name of the scorer LLM."
+)
+
+# main asserts the value is one of text-bison, gpt-3.5-turbo, gpt-4 or local.
+_OPTIMIZER = flags.DEFINE_string(
+    "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
+)
+
+# Lower-cased in main, which then asserts it is one of mmlu, bbh or gsm8k.
+_DATASET = flags.DEFINE_string(
+    "dataset", "gsm8k", "The name of dataset to search for instructions on."
+)
+
+# The set of accepted task names depends on the dataset (see the asserts in
+# main).
+_TASK = flags.DEFINE_string(
+    "task",
+    "train",
+    "The name of task within the above dataset to search for instructions on.",
+)
+
+# main asserts the value is one of before_Q, Q_begin, Q_end or A_begin.
+_INSTRUCTION_POS = flags.DEFINE_string(
+    "instruction_pos",
+    "A_begin",
+    "The position of the instruction to search for.",
+)
+
+# main asserts the value is both_instructions_and_exemplars or
+# instructions_only.
+_META_PROMPT_TYPE = flags.DEFINE_string(
+    "meta_prompt_type",
+    "both_instructions_and_exemplars",
+    "The type of meta-prompt: whether to have both previous instructions and"
+    " dataset exemplars (often for fine-tuned optimizers), or to have only"
+    " previous instructions (often for pre-trained optimizers).",
+)
+
+
+def main(_):
+  openai_api_key = _OPENAI_API_KEY.value
+  palm_api_key = _PALM_API_KEY.value
+  scorer_llm_name = _SCORER.value
+  optimizer_llm_name = _OPTIMIZER.value
+  dataset_name = _DATASET.value.lower()
+  task_name = _TASK.value
+  meta_prompt_type = _META_PROMPT_TYPE.value
+  local_model_path = _LOCAL_MODEL_PATH.value
+
+  assert dataset_name in {
+      "mmlu",
+      "bbh",
+      "gsm8k",
+  }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k."
+  if dataset_name == "mmlu":
+    assert task_name in {
+        "STEM",
+        "humanities",
+        "social sciences",
+        "otheran (business, health, misc.)",
+    }  # for now only support searching on one MMLU category
+  elif dataset_name == "bbh":
+    assert task_name in {
+        "boolean_expressions",
+        "causal_judgement",
+        "date_understanding",
+        "disambiguation_qa",
+        "dyck_languages",
+        "formal_fallacies",
+        "geometric_shapes",
+        "hyperbaton",
+        "logical_deduction_five_objects",
+        "logical_deduction_seven_objects",
+        "logical_deduction_three_objects",
+        "movie_recommendation",
+        "multistep_arithmetic_two",
+        "navigate",
+        "object_counting",
+        "penguins_in_a_table",
+        "reasoning_about_colored_objects",
+        "ruin_names",
+        "salient_translation_error_detection",
+        "snarks",
+        "sports_understanding",
+        "temporal_sequences",
+        "tracking_shuffled_objects_five_objects",
+        "tracking_shuffled_objects_seven_objects",
+        "tracking_shuffled_objects_three_objects",
+        "web_of_lies",
+        "word_sorting",
+    }
+  else:
+    assert dataset_name == "gsm8k"
+    assert task_name in {"train", "test"}
+
+  assert scorer_llm_name in {
+      "text-bison",
+      "gpt-3.5-turbo",
+      "gpt-4",
+        "local",
+  }
+  assert optimizer_llm_name in {
+      "text-bison",
+      "gpt-3.5-turbo",
+      "gpt-4",
+        "local",
+  }
+  assert meta_prompt_type in {
+      "both_instructions_and_exemplars",
+      "instructions_only",
+  }
+
+  instruction_pos = _INSTRUCTION_POS.value
+  assert instruction_pos in {
+      "before_Q",
+      "Q_begin",
+      "Q_end",
+      "A_begin",
+  }, (
+      "The instruction position should be either before the question, or at the"
+      " beginning of the question, at the end of the question, or at the"
+      " beginning of the answer."
+  )
+  print(
+      f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:"
+      f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}"
+  )
+
+  # make sure the scorer and optimizer models are callable
+
+  if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+    assert openai_api_key, "The OpenAI API key must be provided."
+    openai.api_key = openai_api_key
+  elif scorer_llm_name == "text-bison":
+    assert scorer_llm_name == "text-bison"
+    assert (
+        palm_api_key
+    ), "A PaLM API key is needed when prompting the text-bison model."
+    palm.configure(api_key=palm_api_key)
+  
+  elif scorer_llm_name == "local":
+    assert local_model_path, "The local model path must be provided."
+    assert os.path.exists(local_model_path), (
+        f"The local model path {local_model_path} does not exist."
+    )
+    # set the local model path for vLLM
+    # prompt_utils.call_local_server_func(local_model_path)
+  else:
+    raise ValueError(
+        f"Unknown scorer_llm_name: {scorer_llm_name}. "
+        "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
+    )
+
+  if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+    assert openai_api_key, "The OpenAI API key must be provided."
+    openai.api_key = openai_api_key
+  elif optimizer_llm_name == "text-bison":
+    assert optimizer_llm_name == "text-bison"
+    assert (
+        palm_api_key
+    ), "A PaLM API key is needed when prompting the text-bison model."
+    palm.configure(api_key=palm_api_key)
+  
+  elif optimizer_llm_name == "local":
+    assert local_model_path, "The local model path must be provided."
+    assert os.path.exists(local_model_path), (
+        f"The local model path {local_model_path} does not exist."
+    )
+    # set the local model path for vLLM
+    # prompt_utils.call_local_server_func(local_model_path)
+  else:
+    raise ValueError(
+        f"Unknown scorer_llm_name: {optimizer_llm_name}. "
+        "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
+    )
+
+  
+  if dataset_name == "mmlu":
+    root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data")
+  elif dataset_name == "bbh":
+    root_data_folder_path = os.path.join(
+        ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/"
+    )
+  else:
+    assert dataset_name == "gsm8k"
+    root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data")
+
+  # =================== create the result directory ==========================
+  datetime_str = (
+      str(datetime.datetime.now().replace(microsecond=0))
+      .replace(" ", "-")
+      .replace(":", "-")
+  )
+
+  save_folder = os.path.join(
+      OPRO_ROOT_PATH,
+      "outputs",
+      "optimization-results",
+      f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/",
+  )
+  result_by_instruction_folder = os.path.join(
+      save_folder, "result_by_instruction"
+  )
+  os.makedirs(result_by_instruction_folder)
+  print(f"result directory:\n{save_folder}")
+
+  # ====================== scorer model configs ==============================
+  # difference between num_decodes and batch_size:
+  # - num_decodes: how many outputs we actually want for each input
+  # - batch_size: the batch size in model serving; it should be equal to the
+  #   value in the model serving config
+  # constant definitions
+  DEFAULT_MAX_TOKENS = 1024
+  DEFAULT_TEMPERATURE = 0.0
+  PALM_MODEL_NAME = "text-bison-001"
+
+  if scorer_llm_name == "text-bison":
+    config = {
+        "temperature": DEFAULT_TEMPERATURE,
+        "max_decode_steps": DEFAULT_MAX_TOKENS,
+        "batch_size": 1,
+        "num_servers": 1,
+    }
+    call_scorer_server_func = functools.partial(
+        prompt_utils.call_palm_server_from_cloud,
+        model=PALM_MODEL_NAME,
+        **config
+    )
+    scorer_llm_dict = {"model_type": "text-bison", **config}
+
+  elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+    config = {
+        "temperature": DEFAULT_TEMPERATURE,
+        "max_decode_steps": DEFAULT_MAX_TOKENS,
+        "batch_size": 1,
+        "num_servers": 1,
+    }
+    call_scorer_server_func = functools.partial(
+        prompt_utils.call_openai_server_func,
+        model=scorer_llm_name.lower(),
+        **config
+    )
+    scorer_llm_dict = {"model_type": scorer_llm_name.lower(), **config}
+
+  elif scorer_llm_name == "local":
+    print(f"[DEBUG] local_model_path: {local_model_path}")
+    assert local_model_path, "Local model path must be provided."
+    config = {
+        "temperature": DEFAULT_TEMPERATURE,
+        "max_decode_steps": DEFAULT_MAX_TOKENS,
+        "batch_size": 8,
+        "num_servers": 8,# number of servers to use for local model
+    }
+    call_scorer_server_func = functools.partial(
+        prompt_utils.call_local_server_func,
+        local_model_path=local_model_path,
+        **config
+    )
+    scorer_llm_dict = {"model_type": "local", **config}
+
+  else:
+    raise ValueError(f"Unsupported model: {scorer_llm_name}")
+
+#   if scorer_llm_name == "text-bison":
+#     # when prompting text-bison with Cloud API
+#     scorer_finetuned_palm_temperature = 0.0
+#     scorer_finetuned_palm_max_decode_steps = 1024
+#     scorer_finetuned_palm_batch_size = 1
+#     scorer_finetuned_palm_num_servers = 1
+#     scorer_finetuned_palm_dict = dict()
+#     scorer_finetuned_palm_dict["temperature"] = (
+#         scorer_finetuned_palm_temperature
+#     )
+#     scorer_finetuned_palm_dict["num_servers"] = (
+#         scorer_finetuned_palm_num_servers
+#     )
+#     scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size
+#     scorer_finetuned_palm_dict["max_decode_steps"] = (
+#         scorer_finetuned_palm_max_decode_steps
+#     )
+
+#     call_scorer_finetuned_palm_server_func = functools.partial(
+#         prompt_utils.call_palm_server_from_cloud,
+#         model="text-bison-001",
+#         temperature=scorer_finetuned_palm_dict["temperature"],
+#         max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"],
+#     )
+
+#     scorer_llm_dict = {
+#         "model_type": scorer_llm_name.lower(),
+#     }
+#     scorer_llm_dict.update(scorer_finetuned_palm_dict)
+#     call_scorer_server_func = call_scorer_finetuned_palm_server_func
+
+#   elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
+#     # assert scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}
+#     scorer_gpt_max_decode_steps = 1024
+#     scorer_gpt_temperature = 0.0
+
+#     scorer_gpt_dict = dict()
+#     scorer_gpt_dict["max_decode_steps"] = scorer_gpt_max_decode_steps
+#     scorer_gpt_dict["temperature"] = scorer_gpt_temperature
+#     scorer_gpt_dict["num_decodes"] = 1
+#     scorer_gpt_dict["batch_size"] = 1
+#     scorer_gpt_dict["num_servers"] = 1
+
+#     scorer_llm_dict = {
+#         "model_type": scorer_llm_name.lower(),
+#     }
+#     scorer_llm_dict.update(scorer_gpt_dict)
+#     call_scorer_server_func = functools.partial(
+#         prompt_utils.call_openai_server_func,
+#         model=scorer_llm_name.lower(),
+#         max_decode_steps=scorer_gpt_max_decode_steps,
+#         temperature=scorer_gpt_temperature,
+#     )
+#     elif scorer_llm_name == "local":
+#     # local vLLM model
+#     scorer_local_max_decode_steps = 1024
+#     scorer_local_temperature = 0.0
+#     call_scorer_server_func = functools.partial(
+#         prompt_utils.call_local_model_server_func,
+#         model_path=local_model_path,
+#         max_decode_steps=scorer_local_max_decode_steps,
+#         temperature=scorer_local_temperature,
+#     )
+#     else:
+#         raise ValueError(
+#         f"Unknown scorer_llm_name: {scorer_llm_name}. "
+#         "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
+#     )
+
+  # ====================== optimizer model configs ============================
+  if optimizer_llm_name.lower() == "text-bison":
+    # PaLM text-bison model configuration
+    optimizer_llm_dict = {
+        "model_type": "text-bison",
+        "temperature": 1.0,          # 更高的随机性以生成多样化解
+        "max_decode_steps": 1024,    # 最大生成长度
+        "batch_size": 1,             # 单样本处理
+        "num_decodes": 8,            # 生成8个候选结果
+        "num_servers": 1             # 单服务器
+    }
+    
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_palm_server_from_cloud,
+        model="text-bison-001",
+        temperature=optimizer_llm_dict["temperature"],
+        max_decode_steps=optimizer_llm_dict["max_decode_steps"],
+    )
+
+  elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}:
+    # GPT model configuration
+    optimizer_llm_dict = {
+        "model_type": optimizer_llm_name.lower(),
+        "temperature": 1.0,          # 更高的随机性
+        "max_decode_steps": 512,      # 较短的最大长度
+        "batch_size": 1,
+        "num_decodes": 1 ,             # 单次生成
+         "num_servers": 1             # 单服务器
+    }
+    
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_openai_server_func,
+        model=optimizer_llm_name,
+        max_decode_steps=optimizer_llm_dict["max_decode_steps"],
+        temperature=optimizer_llm_dict["temperature"],
+    )
+  elif optimizer_llm_name.lower() == "local":
+    assert local_model_path, "Local model path must be provided."
+    optimizer_llm_dict = {
+        "model_type": optimizer_llm_name.lower(),
+        "temperature": 1.0,          # 更高的随机性
+        "max_decode_steps": 512,      # 较短的最大长度
+        "batch_size": 8,
+        "num_decodes": 1 ,             # 单次生成
+        "num_servers": 8             # 单服务器
+    }
+    call_optimizer_server_func = functools.partial(
+        prompt_utils.call_local_server_func,
+        local_model_path=local_model_path,
+        max_decode_steps=optimizer_llm_dict["max_decode_steps"],
+        temperature=optimizer_llm_dict["temperature"],
+    )
+
+  else:
+    raise ValueError(
+        f"Unsupported optimizer model: {optimizer_llm_name}. "
+        "Must be one of: text-bison, gpt-3.5-turbo, gpt-4"
+    )
+
+#   if optimizer_llm_name.lower() == "text-bison":
+#     # when prompting text-bison with Cloud API
+#     optimizer_finetuned_palm_temperature = 1.0
+#     optimizer_finetuned_palm_num_decodes = 8
+#     optimizer_finetuned_palm_max_decode_steps = 1024
+#     optimizer_finetuned_palm_batch_size = 1
+#     optimizer_finetuned_palm_num_servers = 1
+#     optimizer_finetuned_palm_dict = dict()
+#     optimizer_finetuned_palm_dict["temperature"] = (
+#         optimizer_finetuned_palm_temperature
+#     )
+#     optimizer_finetuned_palm_dict["num_decodes"] = (
+#         optimizer_finetuned_palm_num_decodes
+#     )
+#     optimizer_finetuned_palm_dict["batch_size"] = (
+#         optimizer_finetuned_palm_batch_size
+#     )
+#     optimizer_finetuned_palm_dict["num_servers"] = (
+#         optimizer_finetuned_palm_num_servers
+#     )
+#     optimizer_finetuned_palm_dict["max_decode_steps"] = (
+#         optimizer_finetuned_palm_max_decode_steps
+#     )
+
+#     call_optimizer_finetuned_palm_server_func = functools.partial(
+#         prompt_utils.call_palm_server_from_cloud,
+#         model="text-bison-001",
+#         temperature=optimizer_finetuned_palm_dict["temperature"],
+#         max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
+#     )
+
+#     optimizer_llm_dict = {
+#         "model_type": optimizer_llm_name.lower(),
+#     }
+#     optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
+#     call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
+
+#   else:
+#     assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
+#     optimizer_gpt_max_decode_steps = 512
+#     optimizer_gpt_temperature = 1.0
+
+#     optimizer_llm_dict = dict()
+#     optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
+#     optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
+#     optimizer_llm_dict["batch_size"] = 1
+#     optimizer_llm_dict["num_decodes"] = 1
+#     call_optimizer_server_func = functools.partial(
+#         prompt_utils.call_openai_server_func,
+#         model=optimizer_llm_name,
+#         max_decode_steps=optimizer_gpt_max_decode_steps,
+#         temperature=optimizer_gpt_temperature,
+#     )
+
+  # ====================== try calling the servers ============================
+  print("\n======== testing the scorer and optimizer servers ===========")
+  scorer_test_output = call_scorer_server_func(
+      "Does the sun rise from the north? Just answer yes or no."
+  )
+  print(f"number of scorer output decodes: {len(scorer_test_output)}")
+  print(f"scorer test output: {scorer_test_output}")
+  optimizer_test_output = call_optimizer_server_func(
+      "Does the sun rise from the north? Just answer yes or no.",
+      temperature=1.0,
+  )
+  print(f"number of optimizer output decodes: {len(optimizer_test_output)}")
+  print(f"optimizer test output: {optimizer_test_output}")
+  print("Finished testing the servers.")
+
+  # ====================== read data ============================
+  print("\n================ prompt optimization settings ==============")
+  # from https://github.com/hendrycks/test/blob/master/categories.py
+  subcategories = {
+      "abstract_algebra": ["math"],
+      "anatomy": ["health"],
+      "astronomy": ["physics"],
+      "business_ethics": ["business"],
+      "clinical_knowledge": ["health"],
+      "college_biology": ["biology"],
+      "college_chemistry": ["chemistry"],
+      "college_computer_science": ["computer science"],
+      "college_mathematics": ["math"],
+      "college_medicine": ["health"],
+      "college_physics": ["physics"],
+      "computer_security": ["computer science"],
+      "conceptual_physics": ["physics"],
+      "econometrics": ["economics"],
+      "electrical_engineering": ["engineering"],
+      "elementary_mathematics": ["math"],
+      "formal_logic": ["philosophy"],
+      "global_facts": ["other"],
+      "high_school_biology": ["biology"],
+      "high_school_chemistry": ["chemistry"],
+      "high_school_computer_science": ["computer science"],
+      "high_school_european_history": ["history"],
+      "high_school_geography": ["geography"],
+      "high_school_government_and_politics": ["politics"],
+      "high_school_macroeconomics": ["economics"],
+      "high_school_mathematics": ["math"],
+      "high_school_microeconomics": ["economics"],
+      "high_school_physics": ["physics"],
+      "high_school_psychology": ["psychology"],
+      "high_school_statistics": ["math"],
+      "high_school_us_history": ["history"],
+      "high_school_world_history": ["history"],
+      "human_aging": ["health"],
+      "human_sexuality": ["culture"],
+      "international_law": ["law"],
+      "jurisprudence": ["law"],
+      "logical_fallacies": ["philosophy"],
+      "machine_learning": ["computer science"],
+      "management": ["business"],
+      "marketing": ["business"],
+      "medical_genetics": ["health"],
+      "miscellaneous": ["other"],
+      "moral_disputes": ["philosophy"],
+      "moral_scenarios": ["philosophy"],
+      "nutrition": ["health"],
+      "philosophy": ["philosophy"],
+      "prehistory": ["history"],
+      "professional_accounting": ["other"],
+      "professional_law": ["law"],
+      "professional_medicine": ["health"],
+      "professional_psychology": ["psychology"],
+      "public_relations": ["politics"],
+      "security_studies": ["politics"],
+      "sociology": ["culture"],
+      "us_foreign_policy": ["politics"],
+      "virology": ["health"],
+      "world_religions": ["philosophy"],
+  }
+
+  categories = {
+      "STEM": [
+          "physics",
+          "chemistry",
+          "biology",
+          "computer science",
+          "math",
+          "engineering",
+      ],
+      "humanities": ["history", "philosophy", "law"],
+      "social sciences": [
+          "politics",
+          "culture",
+          "economics",
+          "geography",
+          "psychology",
+      ],
+      "other (business, health, misc.)": ["other", "business", "health"],
+  }
+
+  if dataset_name == "mmlu":
+    # EITHER: filter by category
+    # category_names = [
+    #     "STEM",
+    #     "humanities",
+    #     "social sciences",
+    #     "other (business, health, misc.)",
+    # ]
+    category_names = [task_name]
+    folder_name = "test"  # one of {'auxiliary_train', 'dev', 'val', 'test'}
+    task_names = []
+    for task_csv_name in os.listdir(
+        os.path.join(root_data_folder_path, folder_name)
+    ):
+      task_names.append(task_csv_name.split(".")[0])
+
+    tasks_in_category = []
+    for category_name in category_names:
+      for task_name in task_names:
+        for subname in subcategories:
+          if subname in task_name:
+            if subcategories[subname][0] in categories[category_name]:
+              tasks_in_category.append(task_name)
+              break
+
+    tasks_all = [(folder_name, task_name) for task_name in tasks_in_category]
+    multiple_choice_tasks = set([item[1] for item in tasks_all])
+    boolean_tasks = set()
+    numerical_output_tasks = set()
+
+    # OR: filter by task
+    # tasks_all = [
+    #     # ('test', 'abstract_algebra_test'),
+    #     # ('test', 'college_computer_science_test'),
+    #     # ('test', 'college_mathematics_test'),
+    #     # ('test', 'college_physics_test'),
+    #     # ('test', 'elementary_mathematics_test'),
+    #     # ('test', 'global_facts_test'),
+    #     # ('test', 'high_school_physics_test'),
+    #     # ('test', 'machine_learning_test'),
+    #     # ('test', 'management_test'),
+    #     # ('test', 'medical_genetics_test'),
+    #     # ('test', 'moral_scenarios_test'),
+    #     # ('test', 'professional_psychology_test'),
+    #     # ('test', 'public_relations_test'),
+    #     # ('test', 'professional_law_test'),
+    #     # ('test', 'high_school_psychology_test'),
+    #     # ('test', 'high_school_world_history_test'),
+    #     # ('test', 'human_aging_test'),
+    #     # ('test', 'miscellaneous_test'),
+    #     # ('test', 'moral_scenarios_test'),
+    #     ('test', 'professional_psychology_test'),
+    #     # ('test', 'security_studies_test'),
+    # ]
+
+  elif dataset_name == "bbh":
+    tasks_all = [task_name]
+    assert (
+        len(tasks_all) == 1
+    ), "for now only support prompt optimization on one BBH task"
+
+    # all BBH tasks are as below
+    # tasks_all = [
+    #     'boolean_expressions',
+    #     'causal_judgement',
+    #     'date_understanding',
+    #     'disambiguation_qa',
+    #     'dyck_languages',
+    #     'formal_fallacies',
+    #     'geometric_shapes',
+    #     'hyperbaton',
+    #     'logical_deduction_five_objects',
+    #     'logical_deduction_seven_objects',
+    #     'logical_deduction_three_objects',
+    #     'movie_recommendation',
+    #     'multistep_arithmetic_two',
+    #     'navigate',
+    #     'object_counting',
+    #     'penguins_in_a_table',
+    #     'reasoning_about_colored_objects',
+    #     'ruin_names',
+    #     'salient_translation_error_detection',
+    #     'snarks',
+    #     'sports_understanding',
+    #     'temporal_sequences',
+    #     'tracking_shuffled_objects_five_objects',
+    #     'tracking_shuffled_objects_seven_objects',
+    #     'tracking_shuffled_objects_three_objects',
+    #     'web_of_lies',
+    #     'word_sorting'
+    # ]
+    numerical_output_tasks = {
+        "object_counting",
+        "multistep_arithmetic_two",
+    }
+
+    multiple_choice_tasks = {
+        "date_understanding",
+        "disambiguation_qa",
+        "geometric_shapes",
+        "hyperbaton",
+        "logical_deduction_five_objects",
+        "logical_deduction_seven_objects",
+        "logical_deduction_three_objects",
+        "movie_recommendation",
+        "penguins_in_a_table",
+        "reasoning_about_colored_objects",
+        "ruin_names",
+        "salient_translation_error_detection",
+        "snarks",
+        "temporal_sequences",
+        "tracking_shuffled_objects_five_objects",
+        "tracking_shuffled_objects_seven_objects",
+        "tracking_shuffled_objects_three_objects",
+    }
+
+    boolean_tasks = {
+        "boolean_expressions",  # True or False
+        "causal_judgement",  # yes or no
+        "formal_fallacies",  # valid or invalid
+        "navigate",  # yes or no
+        "sports_understanding",  # yes or no
+        "web_of_lies",  # yes or no
+    }
+
+  else:
+    assert dataset_name in {"gsm8k"}
+    tasks_all = [task_name]
+    multiple_choice_tasks = set()
+    boolean_tasks = set()
+    numerical_output_tasks = set(tasks_all)
+
+  if dataset_name == "mmlu":
+    raw_data = pd.DataFrame()
+    prediction_treat_as_number = False
+    prediction_treat_as_bool = False
+  elif dataset_name == "bbh":
+    raw_data = []
+    prediction_treat_as_number = bool(
+        tasks_all[0] in numerical_output_tasks
+    )  # for now only check the first task
+    prediction_treat_as_bool = bool(
+        tasks_all[0] in boolean_tasks
+    )  # for now only check the first task
+    print(
+        f"prediction_treat_as_number: {prediction_treat_as_number},"
+        f" prediction_treat_as_bool: {prediction_treat_as_bool}"
+    )
+  else:
+    assert dataset_name == "gsm8k"
+    raw_data = pd.DataFrame()
+    prediction_treat_as_number = True
+    prediction_treat_as_bool = False
+
+  for t in tasks_all:
+    if dataset_name == "mmlu":
+      folder_name = t[0]
+      task_name = t[1]
+      single_task_df = pd.read_csv(
+          os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"),
+          index_col=None,
+          header=None,
+      )
+      raw_data = pd.concat([raw_data, single_task_df])
+    elif dataset_name == "bbh":
+      task_name = t
+      single_task_list = opt_utils.load_bbh_task_data(
+          task_name, base_dir=root_data_folder_path
+      )
+      raw_data += single_task_list
+    else:
+      assert dataset_name == "gsm8k"
+      task_name = t
+      f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv")
+      single_task_df = pd.read_csv(f_gsm, sep="\t", header=None)
+      raw_data = pd.concat([raw_data, single_task_df])
+
+  if dataset_name == "mmlu":
+    num_examples = raw_data.shape[0]
+  elif dataset_name == "bbh":
+    num_examples = len(raw_data)
+  else:
+    assert dataset_name in {"gsm8k"}
+    num_examples = raw_data.shape[0]
+  print(f"number of examples in the current task: {num_examples}")
+
+  # ================ split data into train/val/test ==========================
+  if dataset_name == "mmlu":
+    train_ratio = 0.8
+    eval_ratio = 0.2
+  elif dataset_name == "gsm8k":
+    train_ratio = 0.035
+    eval_ratio = 0
+  else:
+    assert dataset_name == "bbh"
+    train_ratio = 0.2
+    eval_ratio = 0
+
+  # train-validation-test split
+  # It is important to sort the indices, as this ensures the is_multiple_choice
+  # Boolean variables match the data points.
+  assert train_ratio + eval_ratio <= 1
+  test_ratio = 1 - train_ratio - eval_ratio
+  print(
+      f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, "
+      f"test_ratio: {test_ratio}"
+  )
+  np.random.seed(0)
+  train_index = np.sort(
+      np.array(
+          np.random.choice(
+              num_examples, size=int(train_ratio * num_examples), replace=False
+          )
+      )
+  )
+  eval_and_test_index = np.sort(
+      np.array(list(set(np.arange(num_examples)) - set(train_index)))
+  )
+  eval_index = np.sort(
+      np.array(
+          np.random.choice(
+              eval_and_test_index,
+              size=int(eval_ratio * num_examples),
+              replace=False,
+          )
+      )
+  )
+
+  # ========== set other optimization experiment hyperparameters ==============
+  if scorer_llm_name == "text-bison":
+    old_instruction_score_threshold = 0.0 # 完全保留旧指令 表示不过滤任何历史指令（即使质量很低的旧指令也会保留）。
+    # old_instruction_score_threshold = 0.15  # for GSM8K
+  elif scorer_llm_name == "local":
+    old_instruction_score_threshold = 0.3
+  else:
+    assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"} # 模型校验
+    old_instruction_score_threshold = 0.3 # 过滤低质量旧指令
+
+  if scorer_llm_name == "text-bison":
+    extract_final_answer_by_prompting_again = False # 是否通过二次提示提取最终答案（例如从冗长响应中提取关键内容）
+    include_qa = False # 是否在元提示中包含问答对
+    evaluate_in_parallel = False # 是否并行评估
+  elif scorer_llm_name == "local":
+    extract_final_answer_by_prompting_again = True
+    include_qa = True
+    evaluate_in_parallel = True
+  else:
+    assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
+    extract_final_answer_by_prompting_again = False
+    include_qa = False
+    evaluate_in_parallel = False
+
+  optimizer_llm_temperature = optimizer_llm_dict["temperature"]
+
+  num_few_shot_questions_for_instruction_refinement = 3 # number of few-shot questions 每次优化指令时参考的少样本示例数量（Few-shot QA对）。
+
+  # To change the number of generated instructions in each step, one should
+  # edit the value of the variable below, instead of editing the number of
+  # decodes in model parameters, because those values are limited by model
+  # serving configs.
+  num_generated_instructions_in_each_step = 3 # number of generated instructions in each step 每轮搜索生成的候选指令数量。
+  num_search_steps = 50 # number of search steps 总优化迭代次数。
+
+  initial_instructions = [
+      "Let's solve the problem.",
+      # "",
+      # "The answer is",
+  ]
+  few_shot_qa_pairs = True #是否使用少样本示例指导指令生成。
+  # one of {'accumulative_most_frequent', 'current_most_frequent', 'random',
+  # 'constant'}
+  few_shot_selection_criteria = "random" #对多样性要求高时用 random，稳定性要求高时用 most_frequent。
+  # whether to evaluate generated instructions on the exemplars in meta-prompt
+  evaluate_generated_ins_on_few_shot = False # 是否评估新指令 开发阶段设为 True调试指令质量。
+  # whether to evaluate old instructions on the exemplars in the meta-prompt
+  evaluate_old_ins_on_few_shot = False # 是否评估旧指令  生产阶段设为 False加速运行。
+  # every this number of steps, compute the accuracies of current-step
+  # instructions on the validation set
+  eval_interval = 3 # 每N步在验证集上测试当前指令的准确率。
+
+  max_num_instructions = (
+      20  # 元提示中保留的历史指令数量上限。
+  )
+  # discretize continuous scores into N buckets (e.g. integers 0-100) so that
+  # they are simpler for the model to interpret
+  num_score_buckets = 100
+  # whether to put old instructions and scores before the exemplars in the
+  # meta-prompt (controls the order of past instructions and few-shot exemplars)
+  meta_prompt_instructions_before_exemplars = True
+
+  # ===================== run prompt optimization ======================
+
+  assert few_shot_selection_criteria in {
+      "accumulative_most_frequent",
+      "current_most_frequent",
+      "random",
+      "constant",
+  }
+  evolution_kwargs = {
+      "num_search_steps": num_search_steps,
+      "old_instruction_score_threshold": old_instruction_score_threshold,
+      "scorer_llm_dict": scorer_llm_dict,
+      "optimizer_llm_dict": optimizer_llm_dict,
+      "extract_final_answer_by_prompting_again": (
+          extract_final_answer_by_prompting_again
+      ),
+      "include_qa": include_qa,
+      "evaluate_in_parallel": evaluate_in_parallel,
+      "tasks_all": tasks_all,
+      "train_ratio": train_ratio,
+      "eval_ratio": eval_ratio,
+      "test_ratio": test_ratio,
+      "train_index": train_index,
+      "eval_index": eval_index,
+      "dataset_name": dataset_name,
+      "task_name": task_name,
+      "num_examples": num_examples,
+      "root_data_folder_path": root_data_folder_path,
+      "optimizer_llm_temperature": optimizer_llm_temperature,
+      # "optimizer_llm_temperature_schedule": (
+      #     optimizer_llm_temperature_schedule
+      # ),
+      # "optimizer_llm_temperature_end": optimizer_llm_temperature_end,
+      "initial_instructions": initial_instructions,
+      "multiple_choice_tasks": multiple_choice_tasks,
+      "raw_data": raw_data,
+      "call_scorer_server_func": call_scorer_server_func,
+      "call_optimizer_server_func": call_optimizer_server_func,
+      "instruction_pos": instruction_pos,
+      "prediction_treat_as_number": prediction_treat_as_number,
+      "prediction_treat_as_bool": prediction_treat_as_bool,
+      "result_by_instruction_folder": result_by_instruction_folder,
+      "few_shot_qa_pairs": few_shot_qa_pairs,
+      "num_score_buckets": num_score_buckets,
+      "max_num_instructions": max_num_instructions,
+      "meta_prompt_type": meta_prompt_type,
+      "meta_prompt_instructions_before_exemplars": (
+          meta_prompt_instructions_before_exemplars
+      ),
+      "few_shot_selection_criteria": few_shot_selection_criteria,
+      "optimizer_llm_name": optimizer_llm_name,
+      "num_generated_instructions_in_each_step": (
+          num_generated_instructions_in_each_step
+      ),
+      "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot,
+      "num_few_shot_questions_for_instruction_refinement": (
+          num_few_shot_questions_for_instruction_refinement
+      ),
+      "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot,
+      "eval_interval": eval_interval,
+      "save_folder": save_folder,
+  }
+
+  opt_utils.run_evolution(**evolution_kwargs)
+
+
+# Script entry point: when this file is executed directly (not imported),
+# hand `main` to absl's `app.run`.
+if __name__ == "__main__":
+  app.run(main)