diff --git a/opro_qwen_xinference_demo_package.zip b/opro_qwen_xinference_demo_package.zip
deleted file mode 100644
index 187905f..0000000
Binary files a/opro_qwen_xinference_demo_package.zip and /dev/null differ
diff --git a/optimization/__pycache__/opt_utils.cpython-310.pyc b/optimization/__pycache__/opt_utils.cpython-310.pyc
deleted file mode 100644
index 2c7a4a9..0000000
Binary files a/optimization/__pycache__/opt_utils.cpython-310.pyc and /dev/null differ
diff --git a/optimization/opt_utils.py b/optimization/opt_utils.py
deleted file mode 100644
index 36ad3bf..0000000
--- a/optimization/opt_utils.py
+++ /dev/null
@@ -1,1035 +0,0 @@
-# Copyright 2023 The OPRO Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""The utility functions for prompt optimization."""
-
-import collections
-import json
-import os
-import pickle
-import re
-import sys
-
-OPRO_ROOT_PATH = os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-)
-sys.path.insert(0, OPRO_ROOT_PATH)
-
-import numpy as np
-from opro.evaluation import eval_utils
-import pandas as pd
-
-
-def extract_string_in_square_brackets(input_string):
-  """Returns the first substring enclosed in square brackets, brackets removed.
-
-  Returns an empty string if the input contains no bracketed content.
-  """
-  raw_result = re.findall(r"\[.*?\]", input_string)
-  if raw_result:
-    return raw_result[0][1:-1]
-  else:
-    return ""
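-  # Example: extract_string_in_square_brackets("New text: [Be concise.]")
-  # returns "Be concise.".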
-
-
-def parse_tag_content(text, prefix="<TEXT>", suffix="</TEXT>"):
- pattern = f"{prefix}(.*?){suffix}"
- results = re.findall(pattern, text, re.DOTALL)
- return results
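-  # Example: parse_tag_content("<INS>Be concise.</INS>", "<INS>", "</INS>")
-  # returns ["Be concise."].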
-
-
-def _bucketize_float(num, n_buckets=20):
- assert num >= 0 and num <= 1, "The given number must be between 0 and 1."
-  return round(num * n_buckets)  # maps an instruction score (e.g. 0.85) to an integer bucket
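-  # Example: _bucketize_float(0.85, n_buckets=100) returns 85.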
-
-
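-# The helper below renders each kept (instruction, score, step) tuple as, e.g.:
-#
-#   text:
-#   Let's think step by step.
-#   score:
-#   0.716
-#
-# (scores are rounded to 3 decimals, or bucketized when num_score_buckets is finite).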
-def gen_ins_and_score_pairs_substr(
- old_instructions_and_scores,
- old_instruction_score_threshold=0.1,
- max_num_instructions=1000,
- return_str_only=False,
- num_score_buckets=np.inf,
-):
- """Generate the string that includes instruction-score pairs."""
- assert num_score_buckets == np.inf or isinstance(num_score_buckets, int)
- old_instructions_and_scores_str = ""
- old_instructions_and_scores = sorted(
- old_instructions_and_scores, key=lambda x: x[1]
- )[-max_num_instructions:]
- old_instructions_and_scores_in_meta_prompt = []
- for instruction, score, i_step in old_instructions_and_scores:
- if (
- not old_instruction_score_threshold
- or score >= old_instruction_score_threshold
- ):
- old_instructions_and_scores_in_meta_prompt.append(
- (instruction, score, i_step)
- )
- if num_score_buckets == np.inf:
- score_to_show = round(score, 3)
- else:
- score_to_show = _bucketize_float(score, num_score_buckets)
- old_instructions_and_scores_str += (
- f"\ntext:\n{instruction}\nscore:\n{score_to_show}\n"
- )
- if return_str_only:
- return old_instructions_and_scores_str
- else:
- return (
- old_instructions_and_scores_str,
- old_instructions_and_scores_in_meta_prompt,
- )
-
-
-def gen_meta_prompt(
- old_instructions_and_scores,
- instruction_pos,
- optimizer_llm_name,
- old_instruction_score_threshold=0.1,
- max_num_instructions=1000,
- meta_prompt_type="both_instructions_and_exemplars",
- few_shot_qa_pairs=False,
- include_qa=True,
- data=None,
- few_shot_index_list=None,
- instructions_before_exemplars=True,
- num_score_buckets=np.inf,
- dataset_name="",
- task_name="",
-):
- """Generate meta prompt for instruction rewriting.
-
- Args:
- old_instructions_and_scores (list): a list of (instruction, score, i_step)
- pairs.
-    instruction_pos (str): where to put the instruction, one of {'before_Q',
-      'Q_begin', 'Q_end', 'A_begin'}.
- optimizer_llm_name (str): the name of the LLM used for instruction editing.
- old_instruction_score_threshold (float): only add old instructions with score
- no less than this threshold.
- max_num_instructions (int): the maximum number of instructions in the meta
- prompt.
- meta_prompt_type (str): the type of meta-prompt: whether to have both
- previous instructions and dataset exemplars (often for fine-tuned
- optimizers), or to have only previous instructions (often for pre-trained
- optimizers).
- few_shot_qa_pairs (bool): whether to have few-shot QA pairs in the meta
- prompt.
- include_qa (bool): whether to include "Q:" and "A:" formats in the prompt.
- data (list or pd.DataFrame): the raw data.
- few_shot_index_list (list): the list of indices of few-shot examples.
- instructions_before_exemplars (bool): whether the instruction-score pairs are
- before the exemplars from the dataset.
- num_score_buckets (np.inf or int): the number of score buckets when we
- convert float accuracies to integers. Default to np.inf for not
- bucketizing.
- dataset_name (str): the name of the current dataset. Only used when
- generating task description when meta_prompt_type == "instructions_only".
- task_name (str): the name of the current task. Only used when generating task
- description when meta_prompt_type == "instructions_only".
-
- Returns:
- meta_prompt (str): the generated meta prompt.
- """
- assert instruction_pos in {
- "before_Q",
- "Q_begin",
- "Q_end",
- "A_begin",
- }, (
- "The instruction position should be either before the question, or at the"
- " beginning of the question, at the end of the question, or at the"
- " beginning of the answer."
- )
- assert meta_prompt_type in {
- "both_instructions_and_exemplars",
- "instructions_only",
- }
- assert dataset_name in {
- "mmlu",
- "bbh",
- "gsm8k",
- }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k."
- assert num_score_buckets == np.inf or isinstance(num_score_buckets, int)
-
- meta_prompt = ""
- if meta_prompt_type == "both_instructions_and_exemplars":
-    if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
-      # prompt template for the GPT-style (and local) optimizer models
-      if instruction_pos == "A_begin":
-        # ask the model to write the answer's starting sentence (e.g. "The
-        # answer is ..."), using the scored history as reference
-        meta_prompt_old_instruction_part = (
-            "Your task is to generate the answer starting sentence <Start>."
-            " Below are some previous starting sentences with their scores."
-            " The score ranges from 0 to 100.\n"
-        )
-      else:
-        # ask the model to write an ordinary instruction
-        meta_prompt_old_instruction_part = (
-            "Your task is to generate the instruction <INS>."
-            " Below are some previous instructions with their scores."
-            " The score ranges from 0 to 100.\n"
-        )
-    else:  # prompt template specific to the text-bison model
- assert optimizer_llm_name.lower() == "text-bison"
- meta_prompt_old_instruction_part = (
- "I have some texts along with their corresponding scores."
- " The texts are arranged in ascending order based on their scores,"
- " where higher scores indicate better quality.\n\n"
- )
- # add old instructions
- old_instructions_and_scores_str = gen_ins_and_score_pairs_substr(
- old_instructions_and_scores=old_instructions_and_scores,
- old_instruction_score_threshold=old_instruction_score_threshold,
- max_num_instructions=max_num_instructions,
- return_str_only=True,
- num_score_buckets=num_score_buckets,
- )
- meta_prompt_old_instruction_part += old_instructions_and_scores_str
- # add QA pairs if few_shot_qa_pairs == True
- meta_prompt_exemplar_part = ""
- if few_shot_qa_pairs:
- if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
- meta_prompt_exemplar_part += "Below are some problems.\n"
- else:
- assert optimizer_llm_name.lower() == "text-bison"
-        meta_prompt_exemplar_part += (
-            "The following exemplars show how to apply your text: you replace"
-            " <INS> in each input with your text, then read the input and give"
-            " an output. We say your output is wrong if your output is"
-            " different from the given output, and we say your output is"
-            " correct if they are the same. When replacing <INS> with an old"
-            " piece of text above, we get wrong outputs on the following"
-            " inputs.\n\n"
-        )
- for idx in few_shot_index_list:
- if dataset_name == "mmlu":
- question = eval_utils._format_mmlu_example(data, idx) # pylint: disable=protected-access
- true_answer = data.iloc[idx, -1]
- elif dataset_name == "bbh":
- question = data[idx]["input"]
- true_answer = data[idx]["target"]
- else:
- assert dataset_name == "gsm8k"
- question = data.iloc[idx, 0]
- true_answer = data.iloc[idx, 1]
-
-        if include_qa:  # when "Q:" and "A:" are present in the prompt
-          if instruction_pos == "before_Q":
-            meta_prompt_exemplar_part += f"\ninput:\n<INS>\nQ: {question}\nA:"
-          elif instruction_pos == "Q_begin":
-            meta_prompt_exemplar_part += f"\ninput:\nQ: <INS>\n{question}\nA:"
-          elif instruction_pos == "Q_end":
-            meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\n<INS>\nA:"
-          else:  # instruction_pos == "A_begin"
-            if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
-              meta_prompt_exemplar_part += f"\nQ: {question}\nA: <Start>"
-            else:
-              assert optimizer_llm_name.lower() == "text-bison"
-              meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\nA: <Start>"
-        else:  # when there are no "Q:" and "A:" in the prompt
-          assert instruction_pos in {"Q_begin", "Q_end"}
-          if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
-            if instruction_pos == "Q_begin":
-              meta_prompt_exemplar_part += f"\nProblem:\n<INS>\n{question}\n"
-            elif instruction_pos == "Q_end":
-              meta_prompt_exemplar_part += f"\nProblem:\n{question}\n<INS>\n"
-          else:
-            assert optimizer_llm_name.lower() == "text-bison"
-            if instruction_pos == "Q_begin":
-              meta_prompt_exemplar_part += f"\ninput:\n<INS>\n{question}\n"
-            elif instruction_pos == "Q_end":
-              meta_prompt_exemplar_part += f"\ninput:\n{question}\n<INS>\n"
-
- if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
- meta_prompt_exemplar_part += (
- f"\nGround truth answer:\n{true_answer}\n"
- )
- else:
- assert optimizer_llm_name.lower() == "text-bison"
- meta_prompt_exemplar_part += f"\noutput:\n{true_answer}\n"
-
- if few_shot_qa_pairs:
- if instructions_before_exemplars:
- meta_prompt += (
- meta_prompt_old_instruction_part
- + "\n\n"
- + meta_prompt_exemplar_part
- )
- else:
- meta_prompt += (
- meta_prompt_exemplar_part
- + "\n\n"
- + meta_prompt_old_instruction_part
- )
- else:
- meta_prompt += meta_prompt_old_instruction_part
-
- if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
- if instruction_pos == "A_begin":
- meta_prompt += (
- "\n\nGenerate a starting sentence that is different from all the"
- " sentences above, and has a higher score than all the"
-            " sentences above. The starting sentence should begin with"
-            " <Start> and end with </Start>. The starting sentence should be"
- " concise, effective, and generally applicable to all QA pairs"
- " above."
- )
- else:
- meta_prompt += (
- "\n\nGenerate an instruction that"
- " is different from all the instructions above,"
- " and has a higher score than all the instructions above."
-            " The instruction should begin with <INS> and end with </INS>."
- " The instruction should be concise, effective,"
- " and generally applicable to all problems above."
- )
- else:
- assert optimizer_llm_name.lower() == "text-bison"
- meta_prompt += (
- "\n\nWrite your new text that is different from the old ones and"
- " has a score as high as possible. Write the text in square brackets."
- )
- else:
- # when using a pre-trained model as optimizer
- assert meta_prompt_type == "instructions_only"
-
- assert instruction_pos in {"Q_begin", "Q_end", "A_begin"}
- if instruction_pos == "Q_begin":
- instruction_pos_description = "at the beginning of the question"
- elif instruction_pos == "Q_end":
- instruction_pos_description = "at the end of the question"
- else:
- assert instruction_pos == "A_begin"
- instruction_pos_description = "at the beginning of the answer"
-
- if dataset_name == "gsm8k":
- instruction_task_description = "grade school math"
- elif dataset_name == "mmlu":
- instruction_task_description = task_name
- else:
- assert dataset_name == "bbh"
- instruction_task_description = " ".join(task_name.split("_"))
-
- meta_instruction = (
- f"Create a piece of text {instruction_pos_description.strip()} to"
- " enhance the precision in solving diverse"
- f" {instruction_task_description.strip()} problems."
- )
- old_instructions_and_scores = sorted(
- old_instructions_and_scores, key=lambda x: x[1]
- )
- old_instructions_and_scores_str = ""
- for instruction, score, _ in old_instructions_and_scores:
- if num_score_buckets == np.inf:
- score_to_show = round(score, 2)
- else:
- score_to_show = _bucketize_float(score, num_score_buckets)
- old_instructions_and_scores_str += (
- f"\n\nPrecision: {score_to_show} {instruction}"
- )
- meta_prompt += meta_instruction + old_instructions_and_scores_str
- return meta_prompt
-
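-# Illustrative call of gen_meta_prompt (a hypothetical one-entry history; all
-# other arguments left at their defaults):
-#
-#   gen_meta_prompt(
-#       old_instructions_and_scores=[("Let's think step by step.", 0.72, 0)],
-#       instruction_pos="Q_begin",
-#       optimizer_llm_name="local",
-#       dataset_name="gsm8k",
-#   )
-#
-# returns a meta-prompt that lists the scored instruction history followed by
-# the request to write a new, higher-scoring instruction wrapped in <INS> tags.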
-
-def run_evolution(**kwargs):
- """The function for evolution."""
- # ================= experiment configurations =============================
- num_search_steps = kwargs["num_search_steps"]
- old_instruction_score_threshold = kwargs["old_instruction_score_threshold"]
- scorer_llm_dict = kwargs["scorer_llm_dict"]
- optimizer_llm_dict = kwargs["optimizer_llm_dict"]
- extract_final_answer_by_prompting_again = kwargs[
- "extract_final_answer_by_prompting_again"
- ]
- include_qa = kwargs["include_qa"]
- evaluate_in_parallel = kwargs["evaluate_in_parallel"]
- tasks_all = kwargs["tasks_all"]
- train_ratio = kwargs["train_ratio"]
- eval_ratio = kwargs["eval_ratio"]
- test_ratio = kwargs["test_ratio"]
- train_index = kwargs["train_index"]
- eval_index = kwargs["eval_index"]
- dataset_name = kwargs["dataset_name"]
- task_name = kwargs["task_name"]
- num_examples = kwargs["num_examples"]
- root_data_folder_path = kwargs["root_data_folder_path"]
- optimizer_llm_temperature = kwargs["optimizer_llm_temperature"]
- optimizer_llm_temperature_schedule = (
- kwargs["optimizer_llm_temperature_schedule"]
- if "optimizer_llm_temperature_schedule" in kwargs
- else "constant"
- )
- optimizer_llm_temperature_end = (
- kwargs["optimizer_llm_temperature_end"]
- if "optimizer_llm_temperature_end" in kwargs
- else None
- )
- initial_instructions = kwargs["initial_instructions"]
- multiple_choice_tasks = kwargs["multiple_choice_tasks"]
- raw_data = kwargs["raw_data"]
- call_scorer_server_func = kwargs["call_scorer_server_func"]
- call_optimizer_server_func = kwargs["call_optimizer_server_func"]
- instruction_pos = kwargs["instruction_pos"]
- prediction_treat_as_number = kwargs["prediction_treat_as_number"]
- prediction_treat_as_bool = kwargs["prediction_treat_as_bool"]
- result_by_instruction_folder = kwargs["result_by_instruction_folder"]
- few_shot_qa_pairs = kwargs["few_shot_qa_pairs"]
- num_score_buckets = kwargs["num_score_buckets"]
- max_num_instructions = kwargs["max_num_instructions"]
- meta_prompt_type = kwargs["meta_prompt_type"]
- meta_prompt_instructions_before_exemplars = kwargs[
- "meta_prompt_instructions_before_exemplars"
- ]
- few_shot_selection_criteria = kwargs["few_shot_selection_criteria"]
- optimizer_llm_name = kwargs["optimizer_llm_name"]
- num_generated_instructions_in_each_step = kwargs[
- "num_generated_instructions_in_each_step"
- ]
- evaluate_generated_ins_on_few_shot = kwargs[
- "evaluate_generated_ins_on_few_shot"
- ]
- num_few_shot_questions_for_instruction_refinement = kwargs[
- "num_few_shot_questions_for_instruction_refinement"
- ]
- evaluate_old_ins_on_few_shot = kwargs["evaluate_old_ins_on_few_shot"]
- eval_interval = kwargs["eval_interval"]
- save_folder = kwargs["save_folder"]
- verbose = kwargs["verbose"] if "verbose" in kwargs else False
-
- # =================== assertions =====================
- assert dataset_name in {
- "mmlu",
- "bbh",
- "gsm8k",
- }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k."
- assert optimizer_llm_temperature_schedule in {
- "constant",
- "linear_increase",
- }, "The temperature schedule should be constant or linear_increase."
-
- # =================== save configurations to json file ====================
- configs_dict = dict()
- configs_dict["scorer_llm_dict"] = scorer_llm_dict
- configs_dict["optimizer_llm_dict"] = optimizer_llm_dict
- configs_dict["instruction_pos"] = instruction_pos
- configs_dict["optimizer_llm_temperature"] = optimizer_llm_temperature
- configs_dict["optimizer_llm_temperature_schedule"] = (
- optimizer_llm_temperature_schedule
- )
- configs_dict["optimizer_llm_temperature_end"] = optimizer_llm_temperature_end
- with open(os.path.join(save_folder, "configs_dict.json"), "w") as f:
- json.dump(configs_dict, f, indent=4)
-
- num_servers = scorer_llm_dict["num_servers"]
- batch_size = scorer_llm_dict["batch_size"]
- generated_ins_on_few_shot_results_dict = dict()
- old_ins_on_few_shot_results_dict = dict()
-  # evaluation results every few steps
- # format: [(i_step, instruction, detailed_results_df)]
- eval_results = []
- # all generated instructions, format: [(instruction, score, step_index)]
- # the instructions that were skipped have score NaN
- old_instructions_and_scores_raw = []
- # the new instructions, format: [(instruction, score, step_index)]
- old_instructions_and_scores = []
- meta_prompts = [] # format: [(meta_prompt, step_index)]
- instruction_score_dict = dict() # the dictionary of {instruction: score}
- # the dictionary of the few-shot QA indices in meta-prompt
- # key: step index; value: the list of few-shot indices in that step
- few_shot_index_list_by_step_dict = dict()
- detailed_results_df_by_instruction_dict = dict()
- wrong_questions_from_start_counter = collections.Counter()
- # EVAL results
- eval_detailed_results_df_dict = dict() # {instruction: detailed_results_df}
- instruction_eval_score_dict = dict() # {instruction: eval_score}
- old_instruction_md5_hashstrings_set = set()
-
- print(f"tasks_all: {tasks_all}")
- print(
- f"train_ratio: {train_ratio}, number of training points:"
- f" {int(num_examples * train_ratio)}"
- )
- print(
- f"eval_ratio: {eval_ratio}, number of eval points: "
- f"{int(num_examples * eval_ratio)}"
- )
- print(
- f"test_ratio: {test_ratio}, number of test points: "
- f"{int(num_examples * test_ratio)}"
- )
- print(
- f"optimizer llm temperature: {optimizer_llm_temperature}, schedule:"
- f" {optimizer_llm_temperature_schedule}"
- )
- print(
- f"generating {num_generated_instructions_in_each_step} instructions in"
- f" each step, run for {num_search_steps} steps"
- )
- print(
- "discarding generated instructions with score less than:"
- f" {old_instruction_score_threshold} (old_instruction_score_threshold)"
- )
- print(f"num_score_buckets: {num_score_buckets}")
-
- if dataset_name == "mmlu":
- is_multiple_choice = True
- is_multiple_choice_eval = True
- elif dataset_name in {"gsm8k"}:
- is_multiple_choice = False
- is_multiple_choice_eval = False
- else:
- assert dataset_name == "bbh"
- is_multiple_choice = []
- is_multiple_choice_eval = []
- train_index_by_task_dict = dict()
- eval_index_by_task_dict = dict()
- start_index = 0
- for task_name in tasks_all:
- single_task_list = eval_utils.load_bbh_task_data(
- task_name, base_dir=root_data_folder_path
- )
- end_index = start_index + len(single_task_list)
- train_index_by_task_dict[task_name] = (
- train_index[(train_index >= start_index) & (train_index < end_index)]
- # if " - start_index" is added here, then the dict would contain
- # indices in the original task
- )
- eval_index_by_task_dict[task_name] = (
- eval_index[(eval_index >= start_index) & (eval_index < end_index)]
- # if " - start_index" is added here, then the dict would contain
- # indices in the original task
- )
- start_index = end_index
- is_multiple_choice_single_task_train = [
- task_name in multiple_choice_tasks
- ] * len(train_index_by_task_dict[task_name])
- is_multiple_choice_single_task_eval = [
- task_name in multiple_choice_tasks
- ] * len(eval_index_by_task_dict[task_name])
- is_multiple_choice += is_multiple_choice_single_task_train
- is_multiple_choice_eval += is_multiple_choice_single_task_eval
-
- prev_saved_instructions = set()
-
- # evaluate initial instructions
- print("\n============== evaluating initial instructions ===============")
- for instruction in initial_instructions:
- print(f"""computing the score of "{instruction}" by prompting""")
-
- detailed_results_df = eval_utils.evaluate_single_instruction(
- data=raw_data,
- instruction=instruction,
- eval_index_all=train_index,
- batch_size=batch_size,
- call_server_func=call_scorer_server_func,
- dataset_name=dataset_name,
- num_servers=num_servers,
- extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
- include_qa=include_qa,
- evaluate_in_parallel=evaluate_in_parallel,
- instruction_pos=instruction_pos,
- is_multiple_choice=is_multiple_choice,
- prediction_treat_as_number=prediction_treat_as_number,
- prediction_treat_as_bool=prediction_treat_as_bool,
- prediction_num_decimals=0,
- max_retry=120,
- sleep_time=60,
- verbose=verbose,
- )
-
- detailed_results_df_by_instruction_dict[instruction] = detailed_results_df
- scores = detailed_results_df["accuracy"]
- average_score = np.average(scores)
- print(f"instruction: {instruction}, score: {average_score}")
- filename = eval_utils.instruction_to_filename(instruction)
- file_path = os.path.join(result_by_instruction_folder, f"{filename}.csv")
- detailed_results_df.to_csv(file_path, index=True, header=True)
- print(f"""saving results of "{instruction}" to {file_path}""")
- old_instructions_and_scores.append((instruction, average_score, -1))
- old_instructions_and_scores_raw.append((instruction, average_score, -1))
- instruction_score_dict[instruction] = average_score
-
- # increment the counter on wrong questions
- wrong_question_indices_set = set(
- list(
- detailed_results_df.iloc[
- np.where(detailed_results_df.accuracy == 0.0)[0], :
- ].index
- )
- )
- for idx in wrong_question_indices_set:
- wrong_questions_from_start_counter[idx] += 1
-
- # evolution
- for i_step in range(num_search_steps):
- print(f"\n================== Step {i_step} =====================")
- if not i_step % 10:
- print(f"old_instructions_and_scores: {old_instructions_and_scores}")
-
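-    # With the "linear_increase" schedule, the temperature at this step is
-    # T_start + (i_step / num_search_steps) * (T_end - T_start); otherwise it
-    # stays at optimizer_llm_temperature.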
- if optimizer_llm_temperature_schedule == "linear_increase":
- optimizer_llm_temperature_curr = (
- optimizer_llm_temperature
- + i_step
- / num_search_steps
- * (optimizer_llm_temperature_end - optimizer_llm_temperature)
- )
- else:
- optimizer_llm_temperature_curr = optimizer_llm_temperature
- print(
- f"current optimizer_llm_temperature: {optimizer_llm_temperature_curr}"
- )
-
- # generate new instructions
- if few_shot_qa_pairs:
- if few_shot_selection_criteria == "accumulative_most_frequent":
- # select QA pairs that were done wrong the most number of times
- most_frequent_wrong_question_indices = [
- k
- for k, _ in sorted(
- wrong_questions_from_start_counter.items(), key=lambda x: -x[1]
- )
- ]
- print(
- "len(most_frequent_wrong_question_indices):"
- f" {len(most_frequent_wrong_question_indices)}"
- )
- if (
- len(most_frequent_wrong_question_indices)
- <= num_few_shot_questions_for_instruction_refinement
- ):
- few_shot_index_list = most_frequent_wrong_question_indices.copy()
- else:
- np.random.seed(i_step)
- few_shot_index_list = np.sort(
- np.random.choice(
- most_frequent_wrong_question_indices,
- num_few_shot_questions_for_instruction_refinement,
- replace=False,
- )
- )
-
- elif few_shot_selection_criteria == "current_most_frequent":
- # show exemplars done wrong most often by currently shown instructions
- old_instruction_score_threshold_single_step = (
- old_instruction_score_threshold if i_step > 0 else 0
- )
- _, old_instructions_and_scores_in_meta_prompt = (
- gen_ins_and_score_pairs_substr(
- old_instructions_and_scores=old_instructions_and_scores,
- old_instruction_score_threshold=old_instruction_score_threshold_single_step,
- max_num_instructions=max_num_instructions,
- return_str_only=False,
- num_score_buckets=num_score_buckets,
- )
- )
- wrong_questions_counter_single_step = collections.Counter()
- for ins, _, _ in old_instructions_and_scores_in_meta_prompt:
- filename = eval_utils.instruction_to_filename(ins)
- file_path = os.path.join(
- result_by_instruction_folder, f"{filename}.csv"
- )
- single_ins_df = pd.read_csv(file_path, index_col=0, header=0)
- wrong_question_indices_set_single_old_ins = set(
- list(
- single_ins_df.iloc[
- np.where(single_ins_df.accuracy == 0.0)[0], :
- ].index
- )
- )
- for idx in wrong_question_indices_set_single_old_ins:
- wrong_questions_counter_single_step[idx] += 1
- most_occurred_wrong_questions = [
- k
- for k, v in wrong_questions_counter_single_step.items()
- if v == max(wrong_questions_counter_single_step.values())
- ]
- if (
- len(most_occurred_wrong_questions)
- < num_few_shot_questions_for_instruction_refinement
- ):
- # pylint: disable=cell-var-from-loop
- idx_most_to_least = sorted(
- wrong_questions_counter_single_step,
- key=lambda x: -wrong_questions_counter_single_step[x],
- )
- few_shot_index_list = idx_most_to_least[
- :num_few_shot_questions_for_instruction_refinement
- ]
- else:
- few_shot_index_list = np.sort(
- np.random.choice(
- most_occurred_wrong_questions,
- num_few_shot_questions_for_instruction_refinement,
- replace=False,
- )
- )
- elif few_shot_selection_criteria == "constant":
- np.random.seed(0)
- few_shot_index_list = np.sort(
- np.random.choice(
- train_index,
- num_few_shot_questions_for_instruction_refinement,
- replace=False,
- )
- )
- else:
- assert few_shot_selection_criteria == "random"
- np.random.seed(i_step)
- few_shot_index_list = np.sort(
- np.random.choice(
- train_index,
- num_few_shot_questions_for_instruction_refinement,
- replace=False,
- )
- ).tolist()
-
- few_shot_index_list_by_step_dict[i_step] = few_shot_index_list
-
- meta_prompt = gen_meta_prompt(
- old_instructions_and_scores=old_instructions_and_scores,
- instruction_pos=instruction_pos,
- optimizer_llm_name=optimizer_llm_name,
- old_instruction_score_threshold=old_instruction_score_threshold,
- max_num_instructions=max_num_instructions,
- meta_prompt_type=meta_prompt_type,
- few_shot_qa_pairs=few_shot_qa_pairs,
- include_qa=include_qa,
- data=raw_data,
- few_shot_index_list=few_shot_index_list,
- instructions_before_exemplars=meta_prompt_instructions_before_exemplars,
- num_score_buckets=num_score_buckets,
- dataset_name=dataset_name,
- task_name=task_name,
- )
-
- else: # no few-shot exemplars in meta-prompt
- few_shot_index_list = []
- meta_prompt = gen_meta_prompt(
- old_instructions_and_scores=old_instructions_and_scores,
- instruction_pos=instruction_pos,
- optimizer_llm_name=optimizer_llm_name,
- old_instruction_score_threshold=old_instruction_score_threshold,
- max_num_instructions=max_num_instructions,
- meta_prompt_type=meta_prompt_type,
- few_shot_qa_pairs=False,
- include_qa=include_qa,
- instructions_before_exemplars=meta_prompt_instructions_before_exemplars,
- num_score_buckets=num_score_buckets,
- dataset_name=dataset_name,
- task_name=task_name,
- )
- print(f"\nmeta_prompt: \n\n{meta_prompt}\n")
- meta_prompts.append((meta_prompt, i_step))
- remaining_num_instructions_to_generate = (
- num_generated_instructions_in_each_step
- )
- generated_instructions_raw = []
- while remaining_num_instructions_to_generate > 0:
- optimizer_llm_input_text = meta_prompt
- # generate instructions
- print(f"current temperature: {optimizer_llm_temperature_curr}")
- raw_outputs = call_optimizer_server_func(
- optimizer_llm_input_text,
- temperature=optimizer_llm_temperature_curr,
- )
-
- # Extract the generated instructions from the optimizer LLM output. Only
- # keep some samples if the desired number of remaining instructions
- # is smaller than the total number of decodes in this step.
- if meta_prompt_type == "both_instructions_and_exemplars":
- raw_outputs = raw_outputs[:remaining_num_instructions_to_generate]
- if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
- if instruction_pos == "A_begin":
-            start_string = "<Start>"
-            end_string = "</Start>"
-          else:
-            start_string = "<INS>"
-            end_string = "</INS>"
- for raw_output in raw_outputs:
- if start_string not in raw_output:
- start_index = 0
- else:
- start_index = raw_output.index(start_string) + len(start_string)
- if end_string not in raw_output:
- end_index = len(raw_output)
- else:
- end_index = raw_output.index(end_string)
- new_inst = raw_output[start_index:end_index].strip()
- generated_instructions_raw.append(new_inst)
- else:
- assert optimizer_llm_name.lower() == "text-bison"
- generated_instructions_raw += [
- extract_string_in_square_brackets(string)
- for string in raw_outputs
- ]
-
- remaining_num_instructions_to_generate -= optimizer_llm_dict[
- "batch_size"
- ]
- else:
- assert meta_prompt_type == "instructions_only"
- max_num_instructions_to_keep_in_each_output = 1
- for string in raw_outputs:
- generated_instructions_raw += parse_tag_content(string)[
- :max_num_instructions_to_keep_in_each_output
- ]
- remaining_num_instructions_to_generate -= (
- optimizer_llm_dict["batch_size"]
- * max_num_instructions_to_keep_in_each_output
- )
-
- generated_instructions_raw = list(
- map(eval_utils.polish_sentence, generated_instructions_raw)
- )
- print(f"\ninitially generated instructions: {generated_instructions_raw}\n")
-
- # do not evaluate old instructions again
- generated_instructions = [] # the new instructions generated in this step
- for ins in generated_instructions_raw:
- ins_md5_hashstring = eval_utils.instruction_to_filename(
- ins, md5_hashing=True
- )
- if ins_md5_hashstring not in old_instruction_md5_hashstrings_set:
- generated_instructions.append(ins)
- old_instruction_md5_hashstrings_set.add(ins_md5_hashstring)
- else:
- print(f"already evaluated '{ins}' previously")
- generated_instructions = list(set(generated_instructions))
-
- to_evaluate_instructions = []
- for instruction in generated_instructions:
- if len(instruction) > 500:
- print(f"Step {i_step}, instruction: {instruction}, too long, skipped")
- continue
- if dataset_name == "gsm8k" and any(
- char.isdigit() for char in instruction
- ):
- print(
- f"Step {i_step}, instruction: {instruction}, contains numbers,"
- " skipped"
- )
- continue
- if "INS" in instruction:
- print(
- f"Step {i_step}, instruction: {instruction}, contains 'INS',"
- " skipped"
- )
- continue
- to_evaluate_instructions.append(instruction)
- print(f"\nto-evaluate generated instructions: {to_evaluate_instructions}\n")
-
- # evaluate new instructions on the few-shot exemplars in meta-prompt
- if few_shot_qa_pairs and evaluate_generated_ins_on_few_shot:
- print("evaluating GENERATED instructions on few-shot exemplars")
- single_step_eval_on_few_shot = dict()
- for instruction in to_evaluate_instructions:
- if instruction not in prev_saved_instructions:
- print(
- f"evaluating Step {i_step}, instruction: {instruction} on"
- " few-shot exemplars"
- )
- detailed_results_df = eval_utils.evaluate_single_instruction(
- data=raw_data,
- instruction=instruction,
- eval_index_all=few_shot_index_list,
- batch_size=batch_size,
- call_server_func=call_scorer_server_func,
- dataset_name=dataset_name,
- num_servers=num_servers,
- extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
- include_qa=include_qa,
- evaluate_in_parallel=evaluate_in_parallel,
- instruction_pos=instruction_pos,
- is_multiple_choice=is_multiple_choice,
- prediction_treat_as_number=prediction_treat_as_number,
- prediction_treat_as_bool=prediction_treat_as_bool,
- prediction_num_decimals=0,
- max_retry=5,
- sleep_time=180,
- verbose=verbose,
- )
- single_step_eval_on_few_shot[instruction] = detailed_results_df
-
- print(
- f"Step {i_step}, single_step_eval_on_few_shot:"
- f" {single_step_eval_on_few_shot}\n"
- )
- generated_ins_on_few_shot_results_dict[i_step] = (
- single_step_eval_on_few_shot
- )
-
- # evaluate OLD instructions on the few-shot exemplars in meta-prompt
- if few_shot_qa_pairs and evaluate_old_ins_on_few_shot:
- print("evaluating OLD instructions on few-shot exemplars")
- single_step_eval_on_few_shot = dict()
- for instruction, _, _ in old_instructions_and_scores:
- print(
- f"evaluating Step {i_step}, instruction: {instruction} on few-shot"
- " exemplars"
- )
- detailed_results_df = eval_utils.evaluate_single_instruction(
- data=raw_data,
- instruction=instruction,
- eval_index_all=few_shot_index_list,
- batch_size=scorer_llm_dict["batch_size"],
- call_server_func=call_scorer_server_func,
- dataset_name=dataset_name,
- num_servers=scorer_llm_dict["num_servers"],
- extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
- include_qa=include_qa,
- evaluate_in_parallel=evaluate_in_parallel,
- instruction_pos=instruction_pos,
- is_multiple_choice=is_multiple_choice,
- prediction_treat_as_number=prediction_treat_as_number,
- prediction_treat_as_bool=prediction_treat_as_bool,
- prediction_num_decimals=0,
- max_retry=5,
- sleep_time=180,
- verbose=verbose,
- )
- single_step_eval_on_few_shot[instruction] = detailed_results_df
-
- print(
- f"Step {i_step}, single_step_eval_on_few_shot:"
- f" {single_step_eval_on_few_shot}\n"
- )
- old_ins_on_few_shot_results_dict[i_step] = single_step_eval_on_few_shot
-
- # evaluate newly generated instructions on the training set
- for instruction in to_evaluate_instructions:
- if instruction not in prev_saved_instructions:
- print(f"""computing the score of "{instruction}" by prompting""")
- detailed_results_df = eval_utils.evaluate_single_instruction(
- data=raw_data,
- instruction=instruction,
- eval_index_all=train_index,
- batch_size=batch_size,
- call_server_func=call_scorer_server_func,
- dataset_name=dataset_name,
- num_servers=num_servers,
- extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
- include_qa=include_qa,
- evaluate_in_parallel=evaluate_in_parallel,
- instruction_pos=instruction_pos,
- is_multiple_choice=is_multiple_choice,
- prediction_treat_as_number=prediction_treat_as_number,
- prediction_treat_as_bool=prediction_treat_as_bool,
- prediction_num_decimals=0,
- max_retry=5,
- sleep_time=180,
- verbose=verbose,
- )
- prev_saved_instructions.add(instruction)
- else:
- # do not re-evaluate instructions that had been evaluated previously
- detailed_results_df = pd.read_csv(
- os.path.join(result_by_instruction_folder, f"{instruction}.csv"),
- index_col=0,
- header=0,
- )
- print(f"""reading previously saved "{instruction}" information""")
-
- scores = detailed_results_df["accuracy"]
- average_score = np.average(scores)
- print(
- f"Step {i_step}, instruction: {instruction}, score: {average_score}"
- )
-
- # increment the counter on wrong questions
- wrong_question_indices_set = set(
- list(
- detailed_results_df[detailed_results_df["accuracy"] == 0.0].index
- )
- )
- for idx in wrong_question_indices_set:
- wrong_questions_from_start_counter[idx] += 1
-
- filename = eval_utils.instruction_to_filename(instruction)
- file_path = os.path.join(
- result_by_instruction_folder, f"""{filename}.csv"""
- )
- detailed_results_df.to_csv(file_path, index=True, header=True)
- print(f"saving results to {file_path}")
-
- detailed_results_df_by_instruction_dict[instruction] = detailed_results_df
- old_instructions_and_scores.append((instruction, average_score, i_step))
- instruction_score_dict[instruction] = average_score
-
- # record all generated instructions
- for instruction in generated_instructions_raw:
- if instruction in instruction_score_dict:
- average_score = instruction_score_dict[instruction]
- else:
- average_score = np.nan
- old_instructions_and_scores_raw.append(
- (instruction, average_score, i_step)
- )
-
- # =============================== eval ====================================
- # every eval_interval steps, evaluate the instructions that were generated
- # in the current step and were not skipped
- if not i_step % eval_interval:
- for instruction in generated_instructions_raw:
- # if the instruction wasn't skipped in any step
- if instruction in instruction_score_dict:
- if instruction not in instruction_eval_score_dict:
- detailed_results_df = eval_utils.evaluate_single_instruction(
- data=raw_data,
- instruction=instruction,
- eval_index_all=eval_index,
- batch_size=batch_size,
- call_server_func=call_scorer_server_func,
- dataset_name=dataset_name,
- num_servers=num_servers,
- extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
- include_qa=include_qa,
- evaluate_in_parallel=evaluate_in_parallel,
- instruction_pos=instruction_pos,
- is_multiple_choice=is_multiple_choice_eval,
- prediction_treat_as_number=prediction_treat_as_number,
- prediction_treat_as_bool=prediction_treat_as_bool,
- prediction_num_decimals=0,
- max_retry=5,
- sleep_time=180,
- verbose=verbose,
- )
- eval_score = np.average(detailed_results_df["accuracy"])
- eval_detailed_results_df_dict[instruction] = detailed_results_df
- instruction_eval_score_dict[instruction] = eval_score
- else:
- eval_score = instruction_eval_score_dict[instruction]
- print(
- f"EVAL: \nStep {i_step}, instruction: {instruction}, eval score:"
- f" {eval_score:.2f}"
- )
- eval_results.append((i_step, instruction, eval_score))
-
- # ===================== save up-to-date results ===========================
- results_dict = dict()
- results_dict["meta_prompts"] = meta_prompts
- results_dict["old_instructions_and_scores"] = list(
- old_instructions_and_scores
- )
- results_dict["old_instructions_and_scores_raw"] = list(
- old_instructions_and_scores_raw
- )
- results_dict["generated_ins_on_few_shot_results_dict"] = (
- generated_ins_on_few_shot_results_dict
- )
- results_dict["old_ins_on_few_shot_results_dict"] = (
- old_ins_on_few_shot_results_dict
- )
- results_dict["few_shot_index_list_by_step_dict"] = (
- few_shot_index_list_by_step_dict
- )
- results_dict["eval_results"] = eval_results
- results_dict["eval_detailed_results_df_dict"] = (
- eval_detailed_results_df_dict
- )
- with open(os.path.join(save_folder, "results_dict.pkl"), "wb") as fp:
- pickle.dump(results_dict, fp)
- print(f"\nsaved all results to\n{save_folder}")
diff --git a/optimization/optimize_instructions.py b/optimization/optimize_instructions.py
deleted file mode 100644
index e619ffb..0000000
--- a/optimization/optimize_instructions.py
+++ /dev/null
@@ -1,736 +0,0 @@
-
-import datetime
-import functools
-import os
-import sys
-
-OPRO_ROOT_PATH = os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-)
-sys.path.insert(0, OPRO_ROOT_PATH)
-
-from absl import app
-from absl import flags
-import google.generativeai as palm
-import numpy as np
-import openai
-from opro import prompt_utils
-from opro.optimization import opt_utils
-import pandas as pd
-FLAGS = flags.FLAGS
-ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data")
-
-flags.DEFINE_string("local_model_path", "", "Path to local vLLM model.")
-
-_OPENAI_API_KEY = flags.DEFINE_string(
- "openai_api_key", "", "The OpenAI API key."
-)
-
-_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
-
-_SCORER = flags.DEFINE_string(
- "scorer", "text-bison", "The name of the scorer LLM."
-)
-
-_OPTIMIZER = flags.DEFINE_string(
- "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
-)
-
-_DATASET = flags.DEFINE_string(
- "dataset", "gsm8k", "The name of dataset to search for instructions on."
-)
-
-_TASK = flags.DEFINE_string(
- "task",
- "train",
- "The name of task within the above dataset to search for instructions on.",
-)
-
-_INSTRUCTION_POS = flags.DEFINE_string(
- "instruction_pos",
- "A_begin",
- "The position of the instruction to search for.",
-)
-
-_META_PROMPT_TYPE = flags.DEFINE_string(
- "meta_prompt_type",
- "both_instructions_and_exemplars",
- "The type of meta-prompt: whether to have both previous instructions and"
- " dataset exemplars (often for fine-tuned optimizers), or to have only"
- " previous instructions (often for pre-trained optimizers).",
-)
-
-
-def main(_):
- local_model_path = FLAGS.local_model_path
- openai_api_key = _OPENAI_API_KEY.value
- palm_api_key = _PALM_API_KEY.value
- scorer_llm_name = _SCORER.value
- optimizer_llm_name = _OPTIMIZER.value
- dataset_name = _DATASET.value.lower()
- task_name = _TASK.value
- meta_prompt_type = _META_PROMPT_TYPE.value
-
- assert dataset_name in {
- "mmlu",
- "bbh",
- "gsm8k",
- }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k."
- if dataset_name == "mmlu":
- assert task_name in {
- "STEM",
- "humanities",
- "social sciences",
- "other (business, health, misc.)",
- } # for now only support searching on one MMLU category
- elif dataset_name == "bbh":
- assert task_name in {
- "boolean_expressions",
- "causal_judgement",
- "date_understanding",
- "disambiguation_qa",
- "dyck_languages",
- "formal_fallacies",
- "geometric_shapes",
- "hyperbaton",
- "logical_deduction_five_objects",
- "logical_deduction_seven_objects",
- "logical_deduction_three_objects",
- "movie_recommendation",
- "multistep_arithmetic_two",
- "navigate",
- "object_counting",
- "penguins_in_a_table",
- "reasoning_about_colored_objects",
- "ruin_names",
- "salient_translation_error_detection",
- "snarks",
- "sports_understanding",
- "temporal_sequences",
- "tracking_shuffled_objects_five_objects",
- "tracking_shuffled_objects_seven_objects",
- "tracking_shuffled_objects_three_objects",
- "web_of_lies",
- "word_sorting",
- }
- else:
- assert dataset_name == "gsm8k"
- assert task_name in {"train", "test"}
-
- assert scorer_llm_name in {
- "text-bison",
- "gpt-3.5-turbo",
- "gpt-4",
- "local",
- }
- assert optimizer_llm_name in {
- "text-bison",
- "gpt-3.5-turbo",
- "gpt-4",
- "local",
- }
- assert meta_prompt_type in {
- "both_instructions_and_exemplars",
- "instructions_only",
- }
-
- instruction_pos = _INSTRUCTION_POS.value
- assert instruction_pos in {
- "before_Q",
- "Q_begin",
- "Q_end",
- "A_begin",
- }, (
- "The instruction position should be either before the question, or at the"
- " beginning of the question, at the end of the question, or at the"
- " beginning of the answer."
- )
- print(
- f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:"
- f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}"
- )
-
-
- if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- assert openai_api_key, "The OpenAI API key must be provided."
- openai.api_key = openai_api_key
- elif scorer_llm_name == "text-bison":
- assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model."
- palm.configure(api_key=palm_api_key)
- elif scorer_llm_name == "local":
-    # local model; no API key needed
- pass
- else:
- raise ValueError(f"Unknown scorer model: {scorer_llm_name}")
-
- if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- assert openai_api_key, "The OpenAI API key must be provided."
- openai.api_key = openai_api_key
- elif optimizer_llm_name == "text-bison":
- assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model."
- palm.configure(api_key=palm_api_key)
- elif optimizer_llm_name == "local":
-    # local model; no API key needed
- pass
- else:
-    raise ValueError(f"Unknown optimizer model: {optimizer_llm_name}")
-
-
- if dataset_name == "mmlu":
- root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data")
- elif dataset_name == "bbh":
- root_data_folder_path = os.path.join(
- ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/"
- )
- else:
- assert dataset_name == "gsm8k"
- root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data")
-
- # =================== create the result directory ==========================
- datetime_str = (
- str(datetime.datetime.now().replace(microsecond=0))
- .replace(" ", "-")
- .replace(":", "-")
- )
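-  # e.g. "2024-05-01 13:45:00" becomes "2024-05-01-13-45-00"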
-
-
-  save_folder = os.path.join(
-      OPRO_ROOT_PATH,
-      "outputs",
-      "optimization-results",
-      f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/",
-  )
-
-
- result_by_instruction_folder = os.path.join(
- save_folder, "result_by_instruction"
- )
- print(f"Results will be saved to: {os.path.abspath(result_by_instruction_folder)}")
-  os.makedirs(result_by_instruction_folder, exist_ok=True)
- print(f"result directory:\n{save_folder}")
-
- # ====================== scorer model configs ==============================
-
-
- if scorer_llm_name == "text-bison":
- # when prompting text-bison with Cloud API
- scorer_finetuned_palm_temperature = 0.0
- scorer_finetuned_palm_max_decode_steps = 1024
- scorer_finetuned_palm_batch_size = 1
- scorer_finetuned_palm_num_servers = 1
- scorer_finetuned_palm_dict = dict()
- scorer_finetuned_palm_dict["temperature"] = (
- scorer_finetuned_palm_temperature
- )
- scorer_finetuned_palm_dict["num_servers"] = (
- scorer_finetuned_palm_num_servers
- )
- scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size
- scorer_finetuned_palm_dict["max_decode_steps"] = (
- scorer_finetuned_palm_max_decode_steps
- )
-
- call_scorer_finetuned_palm_server_func = functools.partial(
- prompt_utils.call_palm_server_from_cloud,
- model="text-bison-001",
- temperature=scorer_finetuned_palm_dict["temperature"],
- max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"],
- )
-
- scorer_llm_dict = {
- "model_type": scorer_llm_name.lower(),
- }
- scorer_llm_dict.update(scorer_finetuned_palm_dict)
- call_scorer_server_func = call_scorer_finetuned_palm_server_func
-
-
-  elif scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
-    # switched to the local vLLM-backed call instead of the OpenAI API
- scorer_gpt_max_decode_steps = 1024
- # scorer_gpt_max_decode_steps = 512
- scorer_gpt_temperature = 0.0
-
- scorer_llm_dict = {
- "model_type": scorer_llm_name.lower(),
- "max_decode_steps": scorer_gpt_max_decode_steps,
- "temperature": scorer_gpt_temperature,
- "num_decodes": 1,
- "batch_size": 1,
- "num_servers": 1,
- }
-
-    call_scorer_server_func = functools.partial(
-        prompt_utils.call_openai_server_func,  # your locally implemented vLLM call function
-        max_decode_steps=scorer_gpt_max_decode_steps,
-        temperature=scorer_gpt_temperature,
-        local_model_path=FLAGS.local_model_path,  # pass in your local model path
-    )
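-    # Illustrative usage (assuming the call function returns a list of decoded
-    # strings, one per decode): call_scorer_server_func("Q: 2+2=?\nA:") might
-    # return a single-element list such as ["4"].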
-
- else:
- raise ValueError(f"Unsupported scorer_llm_name: {scorer_llm_name}")
-
-
- # ====================== optimizer model configs ============================
- if optimizer_llm_name.lower() == "text-bison":
- # when prompting text-bison with Cloud API
- optimizer_finetuned_palm_temperature = 1.0
- optimizer_finetuned_palm_num_decodes = 8
- optimizer_finetuned_palm_max_decode_steps = 1024
- optimizer_finetuned_palm_batch_size = 1
- optimizer_finetuned_palm_num_servers = 1
- optimizer_finetuned_palm_dict = dict()
- optimizer_finetuned_palm_dict["temperature"] = (
- optimizer_finetuned_palm_temperature
- )
- optimizer_finetuned_palm_dict["num_decodes"] = (
- optimizer_finetuned_palm_num_decodes
- )
- optimizer_finetuned_palm_dict["batch_size"] = (
- optimizer_finetuned_palm_batch_size
- )
- optimizer_finetuned_palm_dict["num_servers"] = (
- optimizer_finetuned_palm_num_servers
- )
- optimizer_finetuned_palm_dict["max_decode_steps"] = (
- optimizer_finetuned_palm_max_decode_steps
- )
-
- call_optimizer_finetuned_palm_server_func = functools.partial(
- prompt_utils.call_palm_server_from_cloud,
- model="text-bison-001",
- temperature=optimizer_finetuned_palm_dict["temperature"],
- max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
- )
-
- optimizer_llm_dict = {
- "model_type": optimizer_llm_name.lower(),
- }
- optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
- call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
-
- elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
-    # replace the API call with the local vLLM-backed version
- optimizer_gpt_max_decode_steps = 512
-
- optimizer_gpt_temperature = 1.0
-
- optimizer_llm_dict = {
- "max_decode_steps": optimizer_gpt_max_decode_steps,
- "temperature": optimizer_gpt_temperature,
- "batch_size": 1,
- "num_decodes": 1,
- }
-
- call_optimizer_server_func = functools.partial(
-        prompt_utils.call_openai_server_func,  # your locally implemented vLLM call interface
- max_decode_steps=optimizer_gpt_max_decode_steps,
- temperature=optimizer_gpt_temperature,
- local_model_path=FLAGS.local_model_path,
- )
-
- else:
- raise ValueError(f"Unsupported optimizer_llm_name: {optimizer_llm_name}")
-
-
-
- # ====================== try calling the servers ============================
- print("\n======== testing the scorer and optimizer servers ===========")
- scorer_test_output = call_scorer_server_func(
- "Does the sun rise from the north? Just answer yes or no."
- )
- print(f"number of scorer output decodes: {len(scorer_test_output)}")
- print(f"scorer test output: {scorer_test_output}")
- optimizer_test_output = call_optimizer_server_func(
- "Does the sun rise from the north? Just answer yes or no.",
- temperature=1.0,
- )
- print(f"number of optimizer output decodes: {len(optimizer_test_output)}")
- print(f"optimizer test output: {optimizer_test_output}")
- print("Finished testing the servers.")
-
- # ====================== read data ============================
- print("\n================ prompt optimization settings ==============")
- # from https://github.com/hendrycks/test/blob/master/categories.py
- subcategories = {
- "abstract_algebra": ["math"],
- "anatomy": ["health"],
- "astronomy": ["physics"],
- "business_ethics": ["business"],
- "clinical_knowledge": ["health"],
- "college_biology": ["biology"],
- "college_chemistry": ["chemistry"],
- "college_computer_science": ["computer science"],
- "college_mathematics": ["math"],
- "college_medicine": ["health"],
- "college_physics": ["physics"],
- "computer_security": ["computer science"],
- "conceptual_physics": ["physics"],
- "econometrics": ["economics"],
- "electrical_engineering": ["engineering"],
- "elementary_mathematics": ["math"],
- "formal_logic": ["philosophy"],
- "global_facts": ["other"],
- "high_school_biology": ["biology"],
- "high_school_chemistry": ["chemistry"],
- "high_school_computer_science": ["computer science"],
- "high_school_european_history": ["history"],
- "high_school_geography": ["geography"],
- "high_school_government_and_politics": ["politics"],
- "high_school_macroeconomics": ["economics"],
- "high_school_mathematics": ["math"],
- "high_school_microeconomics": ["economics"],
- "high_school_physics": ["physics"],
- "high_school_psychology": ["psychology"],
- "high_school_statistics": ["math"],
- "high_school_us_history": ["history"],
- "high_school_world_history": ["history"],
- "human_aging": ["health"],
- "human_sexuality": ["culture"],
- "international_law": ["law"],
- "jurisprudence": ["law"],
- "logical_fallacies": ["philosophy"],
- "machine_learning": ["computer science"],
- "management": ["business"],
- "marketing": ["business"],
- "medical_genetics": ["health"],
- "miscellaneous": ["other"],
- "moral_disputes": ["philosophy"],
- "moral_scenarios": ["philosophy"],
- "nutrition": ["health"],
- "philosophy": ["philosophy"],
- "prehistory": ["history"],
- "professional_accounting": ["other"],
- "professional_law": ["law"],
- "professional_medicine": ["health"],
- "professional_psychology": ["psychology"],
- "public_relations": ["politics"],
- "security_studies": ["politics"],
- "sociology": ["culture"],
- "us_foreign_policy": ["politics"],
- "virology": ["health"],
- "world_religions": ["philosophy"],
- }
-
- categories = {
- "STEM": [
- "physics",
- "chemistry",
- "biology",
- "computer science",
- "math",
- "engineering",
- ],
- "humanities": ["history", "philosophy", "law"],
- "social sciences": [
- "politics",
- "culture",
- "economics",
- "geography",
- "psychology",
- ],
- "other (business, health, misc.)": ["other", "business", "health"],
- }
-
- if dataset_name == "mmlu":
-
- category_names = [task_name]
- folder_name = "test" # one of {'auxiliary_train', 'dev', 'val', 'test'}
- task_names = []
- for task_csv_name in os.listdir(
- os.path.join(root_data_folder_path, folder_name)
- ):
- task_names.append(task_csv_name.split(".")[0])
-
- tasks_in_category = []
- for category_name in category_names:
- for task_name in task_names:
- for subname in subcategories:
- if subname in task_name:
- if subcategories[subname][0] in categories[category_name]:
- tasks_in_category.append(task_name)
- break
-
- tasks_all = [(folder_name, task_name) for task_name in tasks_in_category]
- multiple_choice_tasks = set([item[1] for item in tasks_all])
- boolean_tasks = set()
- numerical_output_tasks = set()
-
-
- elif dataset_name == "bbh":
- tasks_all = [task_name]
- assert (
- len(tasks_all) == 1
- ), "for now only support prompt optimization on one BBH task"
-
-
- numerical_output_tasks = {
- "object_counting",
- "multistep_arithmetic_two",
- }
-
- multiple_choice_tasks = {
- "date_understanding",
- "disambiguation_qa",
- "geometric_shapes",
- "hyperbaton",
- "logical_deduction_five_objects",
- "logical_deduction_seven_objects",
- "logical_deduction_three_objects",
- "movie_recommendation",
- "penguins_in_a_table",
- "reasoning_about_colored_objects",
- "ruin_names",
- "salient_translation_error_detection",
- "snarks",
- "temporal_sequences",
- "tracking_shuffled_objects_five_objects",
- "tracking_shuffled_objects_seven_objects",
- "tracking_shuffled_objects_three_objects",
- }
-
- boolean_tasks = {
- "boolean_expressions", # True or False
- "causal_judgement", # yes or no
- "formal_fallacies", # valid or invalid
- "navigate", # yes or no
- "sports_understanding", # yes or no
- "web_of_lies", # yes or no
- }
-
- else:
- assert dataset_name in {"gsm8k"}
- tasks_all = [task_name]
- multiple_choice_tasks = set()
- boolean_tasks = set()
- numerical_output_tasks = set(tasks_all)
-
- if dataset_name == "mmlu":
- raw_data = pd.DataFrame()
- prediction_treat_as_number = False
- prediction_treat_as_bool = False
- elif dataset_name == "bbh":
- raw_data = []
- prediction_treat_as_number = bool(
- tasks_all[0] in numerical_output_tasks
- ) # for now only check the first task
- prediction_treat_as_bool = bool(
- tasks_all[0] in boolean_tasks
- ) # for now only check the first task
- print(
- f"prediction_treat_as_number: {prediction_treat_as_number},"
- f" prediction_treat_as_bool: {prediction_treat_as_bool}"
- )
- else:
- assert dataset_name == "gsm8k"
- raw_data = pd.DataFrame()
- prediction_treat_as_number = True
- prediction_treat_as_bool = False
-
- for t in tasks_all:
- if dataset_name == "mmlu":
- folder_name = t[0]
- task_name = t[1]
- single_task_df = pd.read_csv(
- os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"),
- index_col=None,
- header=None,
- )
- raw_data = pd.concat([raw_data, single_task_df])
- elif dataset_name == "bbh":
- task_name = t
- single_task_list = opt_utils.load_bbh_task_data(
- task_name, base_dir=root_data_folder_path
- )
- raw_data += single_task_list
- else:
- assert dataset_name == "gsm8k"
- task_name = t
- f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv")
- single_task_df = pd.read_csv(f_gsm, sep="\t", header=None)
- raw_data = pd.concat([raw_data, single_task_df])
-
- if dataset_name == "mmlu":
- num_examples = raw_data.shape[0]
- elif dataset_name == "bbh":
- num_examples = len(raw_data)
- else:
- assert dataset_name in {"gsm8k"}
- num_examples = raw_data.shape[0]
- print(f"number of examples in the current task: {num_examples}")
-
- # ================ split data into train/val/test ==========================
- if dataset_name == "mmlu":
- train_ratio = 0.8
-
- eval_ratio = 0.2
- elif dataset_name == "gsm8k":
-    # train_ratio = 0.035
-    train_ratio = 0.01  # originally 0.035; lowered to 0.01, about 74 examples
- eval_ratio = 0
- else:
- assert dataset_name == "bbh"
- train_ratio = 0.2
- eval_ratio = 0
-
-
- assert train_ratio + eval_ratio <= 1
- test_ratio = 1 - train_ratio - eval_ratio
- print(
- f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, "
- f"test_ratio: {test_ratio}"
- )
- np.random.seed(0)
- train_index = np.sort(
- np.array(
- np.random.choice(
- num_examples, size=int(train_ratio * num_examples), replace=False
- )
- )
- )
- eval_and_test_index = np.sort(
- np.array(list(set(np.arange(num_examples)) - set(train_index)))
- )
- eval_index = np.sort(
- np.array(
- np.random.choice(
- eval_and_test_index,
- size=int(eval_ratio * num_examples),
- replace=False,
- )
- )
- )
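-  # Illustrative: the GSM8K train split has 7473 problems, so train_ratio=0.01
-  # samples int(0.01 * 7473) = 74 training indices; with eval_ratio=0 the
-  # eval_index array is empty and everything else falls into the test split.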
-
- # ========== set other optimization experiment hyperparameters ==============
- if scorer_llm_name == "text-bison":
- old_instruction_score_threshold = 0.0
- # old_instruction_score_threshold = 0.15 # for GSM8K
- else:
- assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"}
- old_instruction_score_threshold = 0.3
-
- if scorer_llm_name == "text-bison":
- extract_final_answer_by_prompting_again = False
- include_qa = False
- evaluate_in_parallel = False
- else:
- assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"}
- extract_final_answer_by_prompting_again = False
- include_qa = False
- evaluate_in_parallel = False
-
- optimizer_llm_temperature = optimizer_llm_dict["temperature"]
-
-
-# num_few_shot_questions_for_instruction_refinement = 3
- num_few_shot_questions_for_instruction_refinement = 1 # 减少 few-shot 例子数
-
-# num_generated_instructions_in_each_step = 8
- num_generated_instructions_in_each_step = 2 # generate only 2 instructions per step
-
-# num_search_steps = 200
- num_search_steps = 3 # originally 200; 3 steps are enough for a quick run
-
-
- initial_instructions = [
- "Let's solve the problem.",
- # "",
- # "The answer is",
- ]
- few_shot_qa_pairs = True
- # one of {'accumulative_most_frequent', 'current_most_frequent', 'random',
- # 'constant'}
- few_shot_selection_criteria = "random"
- # whether to evaluate generated instructions on the exemplars in meta-prompt
- evaluate_generated_ins_on_few_shot = False
- # whether to evaluate old instructions on the exemplars in the meta-prompt
- evaluate_old_ins_on_few_shot = False
- # every this number of steps, compute the accuracies of current-step
- # instructions on the validation set
-# eval_interval = 3
- eval_interval = 1 # evaluate every step so results show up immediately
- # eval_interval = 10
- max_num_instructions = (
- 20 # the maximum number of instructions and scores in the meta-prompt
- )
- # The number of buckets when converting scores to integers in the meta-prompt.
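- # (e.g., with 100 buckets an accuracy of 0.437 would appear as the integer 44, assuming scores in [0, 1] are rounded to the nearest bucket)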
- num_score_buckets = 100
- # whether to put old instructions and scores to before exemplars in
- # the meta-prompt
- meta_prompt_instructions_before_exemplars = True
-
- # ===================== run prompt optimization ======================
-
- assert few_shot_selection_criteria in {
- "accumulative_most_frequent",
- "current_most_frequent",
- "random",
- "constant",
- }
- evolution_kwargs = {
- "num_search_steps": num_search_steps,
- "old_instruction_score_threshold": old_instruction_score_threshold,
- "scorer_llm_dict": scorer_llm_dict,
- "optimizer_llm_dict": optimizer_llm_dict,
- "extract_final_answer_by_prompting_again": (
- extract_final_answer_by_prompting_again
- ),
- "include_qa": include_qa,
- "evaluate_in_parallel": evaluate_in_parallel,
- "tasks_all": tasks_all,
- "train_ratio": train_ratio,
- "eval_ratio": eval_ratio,
- "test_ratio": test_ratio,
- "train_index": train_index,
- "eval_index": eval_index,
- "dataset_name": dataset_name,
- "task_name": task_name,
- "num_examples": num_examples,
- "root_data_folder_path": root_data_folder_path,
- "optimizer_llm_temperature": optimizer_llm_temperature,
- # "optimizer_llm_temperature_schedule": (
- # optimizer_llm_temperature_schedule
- # ),
- # "optimizer_llm_temperature_end": optimizer_llm_temperature_end,
- "initial_instructions": initial_instructions,
- "multiple_choice_tasks": multiple_choice_tasks,
- "raw_data": raw_data,
- "call_scorer_server_func": call_scorer_server_func,
- "call_optimizer_server_func": call_optimizer_server_func,
- "instruction_pos": instruction_pos,
- "prediction_treat_as_number": prediction_treat_as_number,
- "prediction_treat_as_bool": prediction_treat_as_bool,
- "result_by_instruction_folder": result_by_instruction_folder,
- "few_shot_qa_pairs": few_shot_qa_pairs,
- "num_score_buckets": num_score_buckets,
- "max_num_instructions": max_num_instructions,
- "meta_prompt_type": meta_prompt_type,
- "meta_prompt_instructions_before_exemplars": (
- meta_prompt_instructions_before_exemplars
- ),
- "few_shot_selection_criteria": few_shot_selection_criteria,
- "optimizer_llm_name": optimizer_llm_name,
- "num_generated_instructions_in_each_step": (
- num_generated_instructions_in_each_step
- ),
- "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot,
- "num_few_shot_questions_for_instruction_refinement": (
- num_few_shot_questions_for_instruction_refinement
- ),
- "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot,
- "eval_interval": eval_interval,
- "save_folder": save_folder,
- }
- print("=== 开始优化过程 ===")
- try:
- opt_utils.run_evolution(**evolution_kwargs)
- print("=== 优化完成 ===")
- except Exception as e:
- import traceback
- print(f"!!! 优化失败: {e} !!!", file=sys.stderr)
- traceback.print_exc()
- sys.exit(1)
-
-
-if __name__ == "__main__":
- app.run(main)
-
diff --git a/optimization/optimize_linear_regression.py b/optimization/optimize_linear_regression.py
deleted file mode 100644
index f856434..0000000
--- a/optimization/optimize_linear_regression.py
+++ /dev/null
@@ -1,424 +0,0 @@
-# Copyright 2023 The OPRO Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-r"""Optimize over the objective function of a linear regression problem.
-
-Usage:
-
-```
-python optimize_linear_regression.py --optimizer="text-bison"
-```
-
-Note:
-- When using a Google-Cloud-served model (like text-bison at
-https://developers.generativeai.google/tutorials/text_quickstart), add
-`--palm_api_key=""`
-- When using an OpenAI model, add `--openai_api_key=""`
-"""
-
-import datetime
-import functools
-import json
-import os
-import re
-import sys
-
-OPRO_ROOT_PATH = os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-)
-sys.path.insert(0, OPRO_ROOT_PATH)
-
-from absl import app
-from absl import flags
-import google.generativeai as palm
-import numpy as np
-import openai
-
-from opro import prompt_utils
-
-_OPENAI_API_KEY = flags.DEFINE_string(
- "openai_api_key", "", "The OpenAI API key."
-)
-
-_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
-
-_OPTIMIZER = flags.DEFINE_string(
- "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
-)
-
-
-def main(_):
- # ============== set optimization experiment configurations ================
- num_points = 50 # number of points in linear regression
- w_true = 15 # the true w
- b_true = 14 # the true b
- max_num_steps = 500 # the number of optimization steps
- num_reps = 5 # the number of repeated runs
- max_num_pairs = 20 # the maximum number of input-output pairs in meta-prompt
- num_input_decimals = 0 # num of decimals for input values in meta-prompt
- num_output_decimals = 0 # num of decimals for output values in meta-prompt
- num_generated_points_in_each_step = 8
-
- # ================ load LLM settings ===================
- optimizer_llm_name = _OPTIMIZER.value
- assert optimizer_llm_name in {
- "text-bison",
- "gpt-3.5-turbo",
- "gpt-4",
- }
- openai_api_key = _OPENAI_API_KEY.value
- palm_api_key = _PALM_API_KEY.value
-
- if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- assert openai_api_key, "The OpenAI API key must be provided."
- openai.api_key = openai_api_key
- else:
- assert optimizer_llm_name == "text-bison"
- assert (
- palm_api_key
- ), "A PaLM API key is needed when prompting the text-bison model."
- palm.configure(api_key=palm_api_key)
-
- # =================== create the result directory ==========================
- datetime_str = (
- str(datetime.datetime.now().replace(microsecond=0))
- .replace(" ", "-")
- .replace(":", "-")
- )
-
- save_folder = os.path.join(
- OPRO_ROOT_PATH,
- "outputs",
- "optimization-results",
- f"linear_regression-o-{optimizer_llm_name}-{datetime_str}/",
- )
- os.makedirs(save_folder)
- print(f"result directory:\n{save_folder}")
-
- # ====================== optimizer model configs ============================
- if optimizer_llm_name.lower() == "text-bison":
- # when prompting text-bison with Cloud API
- optimizer_finetuned_palm_temperature = 1.0
- optimizer_finetuned_palm_max_decode_steps = 1024
- optimizer_finetuned_palm_batch_size = 1
- optimizer_finetuned_palm_num_servers = 1
- optimizer_finetuned_palm_dict = dict()
- optimizer_finetuned_palm_dict["temperature"] = (
- optimizer_finetuned_palm_temperature
- )
- optimizer_finetuned_palm_dict["batch_size"] = (
- optimizer_finetuned_palm_batch_size
- )
- optimizer_finetuned_palm_dict["num_servers"] = (
- optimizer_finetuned_palm_num_servers
- )
- optimizer_finetuned_palm_dict["max_decode_steps"] = (
- optimizer_finetuned_palm_max_decode_steps
- )
-
- call_optimizer_finetuned_palm_server_func = functools.partial(
- prompt_utils.call_palm_server_from_cloud,
- # prompt_utils.call_vllm,
- model="text-bison-001",
- temperature=optimizer_finetuned_palm_dict["temperature"],
- max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
- )
-
- optimizer_llm_dict = {
- "model_type": optimizer_llm_name.lower(),
- }
- optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
- call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
-
- else:
- assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
- optimizer_gpt_max_decode_steps = 1024
- optimizer_gpt_temperature = 1.0
-
- optimizer_llm_dict = dict()
- optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
- optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
- optimizer_llm_dict["batch_size"] = 1
- call_optimizer_server_func = functools.partial(
- prompt_utils.call_openai_server_func,
- model=optimizer_llm_name,
- max_decode_steps=optimizer_gpt_max_decode_steps,
- temperature=optimizer_gpt_temperature,
- )
-
- # ====================== try calling the servers ============================
- print("\n======== testing the optimizer server ===========")
- optimizer_test_output = call_optimizer_server_func(
- "Does the sun rise from the north? Just answer yes or no.",
- temperature=1.0,
- )
- print(f"optimizer test output: {optimizer_test_output}")
- print("Finished testing the optimizer server.")
- print("\n=================================================")
-
- # ====================== utility functions ============================
- def evaluate_loss(X, y, w, b): # pylint: disable=invalid-name
- residual = y - (X * w + b)
- return np.linalg.norm(residual) ** 2
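- # i.e., the least-squares objective sum_i (y_i - (w * x_i + b)) ** 2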
-
- def gen_meta_prompt(
- old_value_pairs_set,
- X, # pylint: disable=invalid-name, unused-argument
- y, # pylint: disable=unused-argument
- num_input_decimals=5,
- num_output_decimals=5,
- max_num_pairs=100,
- ):
- """Generate the meta-prompt for optimization.
-
- Args:
- old_value_pairs_set (set): the set of old (w, b, z) pairs.
- X (np.array): the 1D array of x values.
- y (np.array): the 1D array of y values.
- num_input_decimals (int): the number of decimals for (w, b) in the
- meta-prompt.
- num_output_decimals (int): the number of decimals for z in the meta-prompt.
- max_num_pairs (int): the maximum number of exemplars in the meta-prompt.
-
- Returns:
- meta_prompt (str): the generated meta-prompt.
- """
- old_value_pairs_set = set(
- [ # pylint: disable=g-complex-comprehension
- (
- np.round(w, num_input_decimals)
- if num_input_decimals > 0
- else int(w),
- np.round(b, num_input_decimals)
- if num_input_decimals > 0
- else int(b),
- np.round(z, num_output_decimals)
- if num_output_decimals > 0
- else int(z),
- )
- for w, b, z in old_value_pairs_set
- ]
- )
- old_value_pairs = list(old_value_pairs_set)
- old_value_pairs = sorted(old_value_pairs, key=lambda x: -x[2])[
- -max_num_pairs:
- ]
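- # keep only the max_num_pairs pairs with the lowest loss, listed from highest to lowest loss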
- old_value_pairs_substr = ""
- for w, b, z in old_value_pairs:
- old_value_pairs_substr += f"\ninput:\nw={w}, b={b}\nvalue:\n{z}\n"
- meta_prompt = """
- Now you will help me minimize a function with two input variables w, b. I have some (w, b) pairs and the function values at those points. The pairs are arranged in descending order based on their function values, where lower values are better.
- """.strip()
- meta_prompt += "\n\n"
- meta_prompt += old_value_pairs_substr.strip()
- meta_prompt += "\n\n"
- # function_analytic_form = ""
- # for xi, yi in zip(X, y):
- # function_analytic_form += f"({yi:.4f} - ({xi:.4f} * w + b)) ** 2 + "
- # function_analytic_form = function_analytic_form[:-3]
- # meta_prompt += (
- # "The function has the analytic form f(w, b) ="
- # f" {function_analytic_form}. When evaluating the value of a (w, b)"
- # " pair, you should replace the w and b in the analytic form with your"
- # " values and do the computation."
- # )
- # meta_prompt += "\n\n"
- meta_prompt += """Give me a new (w, b) pair that is different from all pairs above, and has a function value lower than any of the above. Do not write code. The output must end with a pair [w, b], where w and b are numerical values.
- """.strip()
- return meta_prompt
-
- def extract_string_in_square_brackets(input_string):
- raw_result = re.findall(r"\[.*?\]", input_string)
- if raw_result:
- for pair in raw_result[::-1]:
- if "=" not in pair and ("w" in pair or "b" in pair):
- continue
- return pair[1:-1]
- return ""
- else:
- return ""
-
- def parse_output(extracted_output):
- """Parse the extracted output 'w, b' string to np.array([w, b]).
-
- Args:
- extracted_output (str): the extracted output string, like '1.5, 2.5'.
-
- Returns:
- parsed_output (np.array): the parsed output in a numpy array, like [1.5,
- 2.5].
- """
- if not extracted_output:
- return
- extracted_values = []
- for item in extracted_output.split(","):
- if "=" in item:
- item = item[item.index("=") + 1 :]
- extracted_values.append(item.strip())
- parsed_output = np.array(extracted_values).astype(float)
- return parsed_output
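- # Example: parse_output("w=1.5, b=2.5") -> np.array([1.5, 2.5]); returns None for an empty string.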
-
- configs_dict = dict()
- results_dict = dict()
- num_convergence_steps = []
- for i_rep in range(num_reps):
- found_optimal = False
- print(f"\nRep {i_rep}:")
-
- # ================= generate the ground truth X, y =====================
- X = np.arange(num_points).astype(float) + 1 # pylint: disable=invalid-name
- np.random.seed(i_rep + 1)
- y = X * w_true + b_true + np.random.randn(num_points)
- loss_at_true_values = evaluate_loss(X, y, w_true, b_true)
- print(f"value at (w_true, b_true): {loss_at_true_values}")
-
- # ================= generate the starting points =====================
- num_starting_points = 5 # the number of initial points for optimization
- np.random.seed((i_rep + 1) * 10)
- init_w = np.random.uniform(low=10, high=20, size=num_starting_points)
- np.random.seed((i_rep + 1) * 100)
- init_b = np.random.uniform(low=10, high=20, size=num_starting_points)
-
- # ====================== run optimization ============================
- configs_dict_single_rep = {
- "optimizer_llm_configs": optimizer_llm_dict,
- "data": {
- "num_points": num_points,
- "w_true": w_true,
- "b_true": b_true,
- "loss_at_true_values": loss_at_true_values,
- "X": list(X),
- "y": list(y),
- },
- "init_w": list(init_w),
- "init_b": list(init_b),
- "max_num_steps": max_num_steps,
- "max_num_pairs": max_num_pairs,
- "num_input_decimals": num_input_decimals,
- "num_output_decimals": num_output_decimals,
- "num_generated_points_in_each_step": num_generated_points_in_each_step,
- }
- configs_dict[i_rep] = configs_dict_single_rep
- configs_json_path = os.path.join(save_folder, "configs.json")
- print(f"saving configs to\n{configs_json_path}")
- with open(configs_json_path, "w") as f:
- json.dump(configs_dict, f, indent=4)
-
- old_value_pairs_set = set()
- old_value_pairs_with_i_step = [] # format: [(w, b, z = f(w, b), i_step)]
- meta_prompts_dict = dict() # format: {i_step: meta_prompt}
- raw_outputs_dict = dict() # format: {i_step: raw_outputs}
-
- rounded_inits = [
- (np.round(w, num_input_decimals), np.round(b, num_input_decimals))
- for w, b in zip(init_w, init_b)
- ]
- rounded_inits = [
- tuple(item) for item in list(np.unique(rounded_inits, axis=0))
- ]
- for w, b in rounded_inits:
- z = evaluate_loss(X, y, w, b)
- old_value_pairs_set.add((w, b, z))
- old_value_pairs_with_i_step.append((w, b, z, -1))
-
- print("\n================ run optimization ==============")
- print(
- f"initial points: {[tuple(item[:2]) for item in old_value_pairs_set]}"
- )
- print(f"initial values: {[item[-1] for item in old_value_pairs_set]}")
- results_json_path = os.path.join(save_folder, "results.json")
- print(f"saving results to\n{results_json_path}")
-
- for i_step in range(max_num_steps):
- print(f"\nStep {i_step}:")
- meta_prompt = gen_meta_prompt(
- old_value_pairs_set,
- X,
- y,
- num_input_decimals=num_input_decimals,
- num_output_decimals=num_output_decimals,
- max_num_pairs=max_num_pairs,
- )
- if not i_step % 5:
- print("\n=================================================")
- print(f"meta_prompt:\n{meta_prompt}")
- meta_prompts_dict[i_step] = meta_prompt
-
- # generate a maximum of the given number of points in each step
- remaining_num_points_to_generate = num_generated_points_in_each_step
- raw_outputs = []
- while remaining_num_points_to_generate > 0:
- raw_outputs += call_optimizer_server_func(meta_prompt)
- remaining_num_points_to_generate -= optimizer_llm_dict["batch_size"]
- raw_outputs = raw_outputs[:num_generated_points_in_each_step]
-
- raw_outputs_dict[i_step] = raw_outputs
- parsed_outputs = []
- for string in raw_outputs:
- if not i_step % 5:
- print("\n=================================================")
- print("raw output:\n", string)
- print("\n=================================================")
- try:
- parsed_output = parse_output(
- extract_string_in_square_brackets(string)
- )
- if parsed_output is not None and len(parsed_output) == 2:
- parsed_outputs.append(parsed_output)
- except ValueError:
- pass
- parsed_outputs = [tuple(item) for item in parsed_outputs]
- print(f"proposed points before rounding: {parsed_outputs}")
-
- # round the proposed points to the number of decimals in meta-prompt
- rounded_outputs = [
- (np.round(w, num_input_decimals), np.round(b, num_input_decimals))
- for w, b in parsed_outputs
- ]
- rounded_outputs = [
- tuple(item) for item in list(np.unique(rounded_outputs, axis=0))
- ]
- print(f"proposed points after rounding: {rounded_outputs}")
-
- # evaluate the values of proposed and rounded outputs
- single_step_values = []
- for w, b in rounded_outputs:
- if w == w_true and b == b_true:
- found_optimal = True
- z = evaluate_loss(X, y, w, b)
- single_step_values.append(z)
- old_value_pairs_set.add((w, b, z))
- old_value_pairs_with_i_step.append((w, b, z, i_step))
- print(f"single_step_values: {single_step_values}")
-
- # ====================== save results ============================
- results_dict_single_rep = {
- "meta_prompts": meta_prompts_dict,
- "raw_outputs": raw_outputs_dict,
- "old_value_pairs_with_i_step": old_value_pairs_with_i_step,
- }
- results_dict[i_rep] = results_dict_single_rep
- with open(results_json_path, "w") as f:
- json.dump(results_dict, f, indent=4)
- if found_optimal:
- print(
- f"Repetition {i_rep+1}, optimal found at Step {i_step+1}, saving"
- f" final results to\n{save_folder}"
- )
- num_convergence_steps.append(i_step + 1)
- break
- print(f"num_convergence_steps: {num_convergence_steps}")
-
-
-if __name__ == "__main__":
- app.run(main)
diff --git a/optimization/optimize_tsp.py b/optimization/optimize_tsp.py
deleted file mode 100644
index a8f99d9..0000000
--- a/optimization/optimize_tsp.py
+++ /dev/null
@@ -1,430 +0,0 @@
-# Copyright 2024 The OPRO Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-r"""Optimize over the objective function of a traveling salesman problem.
-
-Usage:
-
-```
-python optimize_tsp.py --optimizer="text-bison"
-```
-
-Note:
-- When using a Google-Cloud-served model (like text-bison at
-https://developers.generativeai.google/tutorials/text_quickstart), add
-`--palm_api_key=""`
-- When using an OpenAI model, add `--openai_api_key=""`
-"""
-
-import datetime
-import functools
-import getpass
-import json
-import os
-import re
-import sys
-import itertools
-
-OPRO_ROOT_PATH = os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-)
-sys.path.insert(0, OPRO_ROOT_PATH)
-
-from absl import app
-from absl import flags
-import google.generativeai as palm
-import numpy as np
-import openai
-
-from opro import prompt_utils
-
-
-_OPENAI_API_KEY = flags.DEFINE_string(
- "openai_api_key", "", "The OpenAI API key."
-)
-
-_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
-
-_OPTIMIZER = flags.DEFINE_string(
- "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
-)
-
-_START_ALGORITHM = flags.DEFINE_string(
- "starting_algorithm", "farthest_insertion", "The name of the starting algorithm. Select from [dp, nearest_neighbor, farthest_insertion]"
-)
-
-def main(_):
- # ============== set optimization experiment configurations ================
- num_points = 100 # number of points in TSP
- num_steps = 500 # the number of optimization steps
- max_num_pairs = 10 # the maximum number of input-output pairs in meta-prompt
- num_decimals = 0 # num of decimals for distances in meta-prompt
- num_starting_points = 5 # the number of initial points for optimization
- num_decode_per_step = 8 # the number of decoded solutions per step
-
- # ================ load LLM settings ===================
- optimizer_llm_name = _OPTIMIZER.value
- assert optimizer_llm_name in {
- "text-bison",
- "gpt-3.5-turbo",
- "gpt-4",
- }
- openai_api_key = _OPENAI_API_KEY.value
- palm_api_key = _PALM_API_KEY.value
-
- if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- assert openai_api_key, "The OpenAI API key must be provided."
- openai.api_key = openai_api_key
- else:
- assert optimizer_llm_name == "text-bison"
- assert (
- palm_api_key
- ), "A PaLM API key is needed when prompting the text-bison model."
- palm.configure(api_key=palm_api_key)
-
- # =================== create the result directory ==========================
- datetime_str = (
- str(datetime.datetime.now().replace(microsecond=0))
- .replace(" ", "-")
- .replace(":", "-")
- )
-
- save_folder = os.path.join(
- OPRO_ROOT_PATH,
- "outputs",
- "optimization-results",
- f"tsp-o-{optimizer_llm_name}-{datetime_str}/",
- )
- os.makedirs(save_folder)
- print(f"result directory:\n{save_folder}")
-
- # ====================== optimizer model configs ============================
- if optimizer_llm_name.lower() == "text-bison":
- # when prompting text-bison with Cloud API
- optimizer_finetuned_palm_temperature = 1.0
- optimizer_finetuned_palm_max_decode_steps = 1024
- optimizer_finetuned_palm_batch_size = 1
- optimizer_finetuned_palm_num_servers = 1
- optimizer_finetuned_palm_dict = dict()
- optimizer_finetuned_palm_dict["temperature"] = (
- optimizer_finetuned_palm_temperature
- )
- optimizer_finetuned_palm_dict["batch_size"] = (
- optimizer_finetuned_palm_batch_size
- )
- optimizer_finetuned_palm_dict["num_servers"] = (
- optimizer_finetuned_palm_num_servers
- )
- optimizer_finetuned_palm_dict["max_decode_steps"] = (
- optimizer_finetuned_palm_max_decode_steps
- )
-
- call_optimizer_finetuned_palm_server_func = functools.partial(
- prompt_utils.call_palm_server_from_cloud,
- # prompt_utils.call_vllm,
- model="text-bison-001",
- temperature=optimizer_finetuned_palm_dict["temperature"],
- max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
- )
-
- optimizer_llm_dict = {
- "model_type": optimizer_llm_name.lower(),
- }
- optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
- call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
-
- else:
- assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
- optimizer_gpt_max_decode_steps = 1024
- optimizer_gpt_temperature = 1.0
-
- optimizer_llm_dict = dict()
- optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
- optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
- optimizer_llm_dict["batch_size"] = 1
- call_optimizer_server_func = functools.partial(
- prompt_utils.call_openai_server_func,
- model=optimizer_llm_name,
- max_decode_steps=optimizer_gpt_max_decode_steps,
- temperature=optimizer_gpt_temperature,
- )
-
- # ====================== try calling the servers ============================
- print("\n======== testing the optimizer server ===========")
- optimizer_test_output = call_optimizer_server_func(
- "Does the sun rise from the north? Just answer yes or no.",
- temperature=1.0,
- )
- print(f"optimizer test output: {optimizer_test_output}")
- print("Finished testing the optimizer server.")
- print("\n=================================================")
-
- # ====================== utility functions ============================
- def evaluate_distance(x, y, trace, num_decimals): # pylint: disable=invalid-name
- dis = 0
- try:
- for i in range(len(trace) - 1):
- id0 = trace[i]
- id1 = trace[i + 1]
- dis += np.sqrt((x[id0] - x[id1]) ** 2 + (y[id0] - y[id1]) ** 2)
- except:
- return -1
- id0 = trace[-1]
- id1 = trace[0]
- dis += np.sqrt((x[id0] - x[id1]) ** 2 + (y[id0] - y[id1]) ** 2)
- dis = np.round(dis, num_decimals) if num_decimals > 0 else int(dis)
- return dis
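- # total Euclidean length of the closed tour (the last point connects back to the first);
- # returns -1 if an invalid index is encountered while walking the trace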
-
- def solve_tsp(x, y, num_points, num_decimals, starting_algorithm):
- if starting_algorithm == "nearest_neighbor":
- min_dis = 0
- gt_sol = [0]
- remaining_points = list(range(1, num_points))
- while len(remaining_points) > 0:
- min_p = -1
- min_cur_dis = -1
- for p in remaining_points:
- cur_dis = np.sqrt((x[p] - x[gt_sol[-1]]) ** 2 + (y[p] - y[gt_sol[-1]]) ** 2)
- if min_p == -1 or cur_dis < min_cur_dis:
- min_p = p
- min_cur_dis = cur_dis
- gt_sol.append(min_p)
- min_dis += min_cur_dis
- remaining_points.remove(min_p)
- min_dis += np.sqrt((x[0] - x[gt_sol[-1]]) ** 2 + (y[0] - y[gt_sol[-1]]) ** 2)
- min_dis = np.round(min_dis, num_decimals) if num_decimals > 0 else int(min_dis)
- return gt_sol, min_dis
- elif starting_algorithm == "farthest_insertion":
- gt_sol = [0]
- remaining_points = list(range(1, num_points))
- while len(remaining_points) > 0:
- max_p = -1
- max_cur_dis = -1
- max_cur_index = -1
- for p in remaining_points:
- min_cur_dis = -1
- min_cur_index = -1
- for index in range(1, len(gt_sol) + 1):
- new_sol = gt_sol[:index] + [p] + gt_sol[index:]
- cur_dis = evaluate_distance(x, y, new_sol, num_decimals)
- if min_cur_dis == -1 or cur_dis < min_cur_dis:
- min_cur_dis = cur_dis
- min_cur_index = index
- if max_cur_dis == -1 or min_cur_dis > max_cur_dis:
- max_p = p
- max_cur_dis = min_cur_dis
- max_cur_index = min_cur_index
- gt_sol = gt_sol[:max_cur_index] + [max_p] + gt_sol[max_cur_index:]
- remaining_points.remove(max_p)
- min_dis = evaluate_distance(x, y, gt_sol, num_decimals)
- return gt_sol, min_dis
-
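- # exact bitmask dynamic programming (the "dp" option): state (p, status) is the current
- # endpoint p and the bitmask of visited points; note that 2 << i >> 1 == 1 << i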
- f = {(0, 1): (0, [0])}
- q = [(0, 1)]
- min_dis = -1
- gt_sol = list(range(num_points))
- while len(q) > 0:
- p, status = q[0]
- q = q[1:]
- for i in range(num_points):
- if 2 << i >> 1 & status == 0:
- new_status = status + (2 << i >> 1)
- new_dis = f[(p, status)][0] + np.sqrt((x[i] - x[p]) ** 2 + (y[i] - y[p]) ** 2)
- if (i, new_status) not in f or new_dis < f[(i, new_status)][0]:
- f[(i, new_status)] = (new_dis, f[(p, status)][1] + [i])
- if new_status == (2 << num_points >> 1) - 1:
- new_dis += np.sqrt((x[i] - x[0]) ** 2 + (y[i] - y[0]) ** 2)
- if min_dis == -1 or new_dis < min_dis:
- min_dis = new_dis
- gt_sol = f[(i, new_status)][1][:]
- elif (i, new_status) not in q:
- q.append((i, new_status))
- min_dis = np.round(min_dis, num_decimals) if num_decimals > 0 else int(min_dis)
- return gt_sol, min_dis
-
- def gen_meta_prompt(
- old_value_pairs_set,
- x, # pylint: disable=invalid-name
- y,
- max_num_pairs=100,
- ):
- """Generate the meta-prompt for optimization.
-
- Args:
- old_value_pairs_set (set): the set of old traces.
- x (np.array): the 1D array of x values.
- y (np.array): the 1D array of y values.
- max_num_pairs (int): the maximum number of exemplars in the meta-prompt.
-
- Returns:
- meta_prompt (str): the generated meta-prompt.
- """
- old_value_pairs = list(old_value_pairs_set)
- old_value_pairs = sorted(old_value_pairs, key=lambda x: -x[1])[
- -max_num_pairs:
- ]
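- # keep only the max_num_pairs shortest traces, listed from longest to shortest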
- old_value_pairs_substr = ""
- for trace, dis in old_value_pairs:
- old_value_pairs_substr += f"\n {trace} \nlength:\n{dis}\n"
- meta_prompt = "You are given a list of points with coordinates below:\n"
- for i, (xi, yi) in enumerate(zip(x, y)):
- if i:
- meta_prompt += ", "
- meta_prompt += f"({i}): ({xi}, {yi})"
- meta_prompt += ".\n\nBelow are some previous traces and their lengths. The traces are arranged in descending order based on their lengths, where lower values are better.".strip()
- meta_prompt += "\n\n"
- meta_prompt += old_value_pairs_substr.strip()
- meta_prompt += "\n\n"
- meta_prompt += """Give me a new trace that is different from all traces above, and has a length lower than any of the above. The trace should traverse all points exactly once. The trace should start with '' and end with .
- """.strip()
- return meta_prompt
-
- def extract_string(input_string):
- start_string = "<trace>"
- end_string = "</trace>"
- if start_string not in input_string:
- return ""
- input_string = input_string[input_string.index(start_string) + len(start_string):]
- if end_string not in input_string:
- return ""
- input_string = input_string[:input_string.index(end_string)]
- parsed_list = []
- for p in input_string.split(","):
- p = p.strip()
- try:
- p = int(p)
- except:
- continue
- parsed_list.append(p)
- return parsed_list
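- # Example: "Here is a trace: <trace> 0, 3, 1, 2 </trace>" -> [0, 3, 1, 2]; returns "" if the tags are missing.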
-
- # ================= generate the ground truth trace =====================
-
- x = np.random.uniform(low=-100, high=100, size=num_points)
- y = np.random.uniform(low=-100, high=100, size=num_points)
- x = [np.round(xi, num_decimals) if num_decimals > 0 else int(xi) for xi in x]
- y = [np.round(yi, num_decimals) if num_decimals > 0 else int(yi) for yi in y]
-
- starting_algorithm = _START_ALGORITHM.value
-
- gt_sol, min_dis = solve_tsp(x, y, num_points, num_decimals, starting_algorithm)
- print("ground truth solution" + str(gt_sol))
- print("min distance: ", min_dis)
- gt_sol_str = ",".join([str(i) for i in gt_sol])
- point_list = range(num_points)
- init_sols = []
- while len(init_sols) < num_starting_points:
- sol = np.random.permutation(point_list)
- if sol[0] != 0:
- continue
- sol_str = ",".join([str(i) for i in sol])
- if sol_str == gt_sol_str:
- continue
- init_sols.append(list(sol))
-
- # ====================== run optimization ============================
- configs_dict = {
- "num_starting_points": num_starting_points,
- "num_decode_per_step": num_decode_per_step,
- "optimizer_llm_configs": optimizer_llm_dict,
- "data": {
- "ground truth solution": [",".join([str(i) for i in gt_sol])],
- "loss_at_true_values": min_dis,
- "x": list(x),
- "y": list(y),
- },
- "init_sols": [",".join([str(i) for i in sol]) for sol in init_sols],
- "num_steps": num_steps,
- "max_num_pairs": max_num_pairs,
- "num_decimals": num_decimals,
- }
- configs_json_path = os.path.join(save_folder, "configs.json")
- print(f"saving configs to\n{configs_json_path}")
- with open(configs_json_path, "w") as f:
- json.dump(configs_dict, f, indent=4)
-
- old_value_pairs_set = set()
- old_value_pairs_with_i_step = [] # format: [(trace, dis = f(trace), i_step)]
- meta_prompts_dict = dict() # format: {i_step: meta_prompt}
- raw_outputs_dict = dict() # format: {i_step: raw_outputs}
-
- for sol in init_sols:
- dis = evaluate_distance(x, y, sol, num_decimals)
- sol_str = ",".join([str(i) for i in sol])
- old_value_pairs_set.add((sol_str, dis))
- old_value_pairs_with_i_step.append((sol_str, dis, -1))
-
- print("\n================ run optimization ==============")
- print(f"initial points: {[tuple(item[:-1]) for item in old_value_pairs_set]}")
- print(f"initial values: {[item[-1] for item in old_value_pairs_set]}")
- results_json_path = os.path.join(save_folder, "results.json")
- print(f"saving results to\n{results_json_path}")
-
- for i_step in range(num_steps):
- print(f"\nStep {i_step}:")
- meta_prompt = gen_meta_prompt(
- old_value_pairs_set,
- x,
- y,
- max_num_pairs=max_num_pairs,
- )
- print("\n=================================================")
- print(f"meta_prompt:\n{meta_prompt}")
- meta_prompts_dict[i_step] = meta_prompt
- raw_outputs = []
- parsed_outputs = []
- while len(parsed_outputs) < num_decode_per_step:
- raw_output = call_optimizer_server_func(meta_prompt)
- for string in raw_output:
- print("\n=================================================")
- print("raw output:\n", string)
- try:
- parsed_output = extract_string(string)
- if parsed_output is not None and len(set(parsed_output)) == num_points and len(parsed_output) == num_points and parsed_output[0] == 0:
- dis = evaluate_distance(x, y, parsed_output, num_decimals)
- if dis == -1:
- continue
- parsed_outputs.append(parsed_output)
- raw_outputs.append(string)
- except:
- pass
- print("\n=================================================")
- print(f"proposed points: {parsed_outputs}")
- raw_outputs_dict[i_step] = raw_outputs
-
- # evaluate the values of proposed and rounded outputs
- single_step_values = []
- for trace in parsed_outputs:
- dis = evaluate_distance(x, y, trace, num_decimals)
- single_step_values.append(dis)
- trace_str = ",".join([str(i) for i in trace])
- old_value_pairs_set.add((trace_str, dis))
- old_value_pairs_with_i_step.append((trace_str, dis, i_step))
- print(f"single_step_values: {single_step_values}")
- print("ground truth solution" + str(gt_sol))
- print("min distance: ", min_dis)
-
- # ====================== save results ============================
- results_dict = {
- "meta_prompts": meta_prompts_dict,
- "raw_outputs": raw_outputs_dict,
- "old_value_pairs_with_i_step": old_value_pairs_with_i_step,
- }
- with open(results_json_path, "w") as f:
- json.dump(results_dict, f, indent=4)
-
-
-if __name__ == "__main__":
- app.run(main)
diff --git a/optimization/test.py b/optimization/test.py
deleted file mode 100644
index a38abcc..0000000
--- a/optimization/test.py
+++ /dev/null
@@ -1,967 +0,0 @@
-# Copyright 2023 The OPRO Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-r"""The .py file for prompt optimization.
-
-Usage:
-
-Step 1: edit the starting instructions by modifying `initial_instructions`
-
-Step 2: edit the training ratio by modifying `train_ratio`
-
-Step 3: check if the model configs (like batch size) are the same as the actual serving configs
-
-Step 4: run
-
-```
-python test.py \
- --optimizer="gpt-3.5-turbo" --scorer="text-bison" \
- --instruction_pos="A_begin" --dataset="gsm8k" --task="train"
-```
-
-The outputs will then be written to `outputs/optimization-results/` in the opro folder.
-
-Notes:
-
-1. One or more API keys may need to be provided:
-- When using a Google-Cloud-served model (like text-bison at https://developers.generativeai.google/tutorials/text_quickstart), add `--palm_api_key=""`
-- When using an OpenAI model, add `--openai_api_key=""`
-
-2. The initial instructions should be provided in the "initial_instructions"
-variable.
-"""
-
-import datetime
-import functools
-import os
-import sys
-
-OPRO_ROOT_PATH = os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-)
-sys.path.insert(0, OPRO_ROOT_PATH)
-
-from absl import app
-from absl import flags
-import google.generativeai as palm
-import numpy as np
-import openai
-from opro import prompt_utils
-from opro.optimization import opt_utils
-import pandas as pd
-
-ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data")
-
-_LOCAL_MODEL_PATH = flags.DEFINE_string("local_model_path", None, "Path to local vLLM model.")
-
-
-_OPENAI_API_KEY = flags.DEFINE_string(
- "openai_api_key", "", "The OpenAI API key."
-)
-
-_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
-
-_SCORER = flags.DEFINE_string(
- "scorer", "text-bison", "The name of the scorer LLM."
-)
-
-_OPTIMIZER = flags.DEFINE_string(
- "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
-)
-
-_DATASET = flags.DEFINE_string(
- "dataset", "gsm8k", "The name of dataset to search for instructions on."
-)
-
-_TASK = flags.DEFINE_string(
- "task",
- "train",
- "The name of task within the above dataset to search for instructions on.",
-)
-
-_INSTRUCTION_POS = flags.DEFINE_string(
- "instruction_pos",
- "A_begin",
- "The position of the instruction to search for.",
-)
-
-_META_PROMPT_TYPE = flags.DEFINE_string(
- "meta_prompt_type",
- "both_instructions_and_exemplars",
- "The type of meta-prompt: whether to have both previous instructions and"
- " dataset exemplars (often for fine-tuned optimizers), or to have only"
- " previous instructions (often for pre-trained optimizers).",
-)
-
-
-def main(_):
- openai_api_key = _OPENAI_API_KEY.value
- palm_api_key = _PALM_API_KEY.value
- scorer_llm_name = _SCORER.value
- optimizer_llm_name = _OPTIMIZER.value
- dataset_name = _DATASET.value.lower()
- task_name = _TASK.value
- meta_prompt_type = _META_PROMPT_TYPE.value
- local_model_path = _LOCAL_MODEL_PATH.value
-
- assert dataset_name in {
- "mmlu",
- "bbh",
- "gsm8k",
- }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k."
- if dataset_name == "mmlu":
- assert task_name in {
- "STEM",
- "humanities",
- "social sciences",
- "otheran (business, health, misc.)",
- } # for now only support searching on one MMLU category
- elif dataset_name == "bbh":
- assert task_name in {
- "boolean_expressions",
- "causal_judgement",
- "date_understanding",
- "disambiguation_qa",
- "dyck_languages",
- "formal_fallacies",
- "geometric_shapes",
- "hyperbaton",
- "logical_deduction_five_objects",
- "logical_deduction_seven_objects",
- "logical_deduction_three_objects",
- "movie_recommendation",
- "multistep_arithmetic_two",
- "navigate",
- "object_counting",
- "penguins_in_a_table",
- "reasoning_about_colored_objects",
- "ruin_names",
- "salient_translation_error_detection",
- "snarks",
- "sports_understanding",
- "temporal_sequences",
- "tracking_shuffled_objects_five_objects",
- "tracking_shuffled_objects_seven_objects",
- "tracking_shuffled_objects_three_objects",
- "web_of_lies",
- "word_sorting",
- }
- else:
- assert dataset_name == "gsm8k"
- assert task_name in {"train", "test"}
-
- assert scorer_llm_name in {
- "text-bison",
- "gpt-3.5-turbo",
- "gpt-4",
- "local",
- }
- assert optimizer_llm_name in {
- "text-bison",
- "gpt-3.5-turbo",
- "gpt-4",
- "local",
- }
- assert meta_prompt_type in {
- "both_instructions_and_exemplars",
- "instructions_only",
- }
-
- instruction_pos = _INSTRUCTION_POS.value
- assert instruction_pos in {
- "before_Q",
- "Q_begin",
- "Q_end",
- "A_begin",
- }, (
- "The instruction position should be either before the question, or at the"
- " beginning of the question, at the end of the question, or at the"
- " beginning of the answer."
- )
- print(
- f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:"
- f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}"
- )
-
- # make sure the scorer and optimizer models are callable
-
- if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- assert openai_api_key, "The OpenAI API key must be provided."
- openai.api_key = openai_api_key
- elif scorer_llm_name == "text-bison":
- assert scorer_llm_name == "text-bison"
- assert (
- palm_api_key
- ), "A PaLM API key is needed when prompting the text-bison model."
- palm.configure(api_key=palm_api_key)
-
- elif scorer_llm_name == "local":
- assert local_model_path, "The local model path must be provided."
- assert os.path.exists(local_model_path), (
- f"The local model path {local_model_path} does not exist."
- )
- # set the local model path for vLLM
- # prompt_utils.call_local_server_func(local_model_path)
- else:
- raise ValueError(
- f"Unknown scorer_llm_name: {scorer_llm_name}. "
- "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
- )
-
- if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- assert openai_api_key, "The OpenAI API key must be provided."
- openai.api_key = openai_api_key
- elif optimizer_llm_name == "text-bison":
- assert optimizer_llm_name == "text-bison"
- assert (
- palm_api_key
- ), "A PaLM API key is needed when prompting the text-bison model."
- palm.configure(api_key=palm_api_key)
-
- elif optimizer_llm_name == "local":
- assert local_model_path, "The local model path must be provided."
- assert os.path.exists(local_model_path), (
- f"The local model path {local_model_path} does not exist."
- )
- # set the local model path for vLLM
- # prompt_utils.call_local_server_func(local_model_path)
- else:
- raise ValueError(
- f"Unknown scorer_llm_name: {optimizer_llm_name}. "
- "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
- )
-
-
- if dataset_name == "mmlu":
- root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data")
- elif dataset_name == "bbh":
- root_data_folder_path = os.path.join(
- ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/"
- )
- else:
- assert dataset_name == "gsm8k"
- root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data")
-
- # =================== create the result directory ==========================
- datetime_str = (
- str(datetime.datetime.now().replace(microsecond=0))
- .replace(" ", "-")
- .replace(":", "-")
- )
-
- save_folder = os.path.join(
- OPRO_ROOT_PATH,
- "outputs",
- "optimization-results",
- f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/",
- )
- result_by_instruction_folder = os.path.join(
- save_folder, "result_by_instruction"
- )
- os.makedirs(result_by_instruction_folder)
- print(f"result directory:\n{save_folder}")
-
- # ====================== scorer model configs ==============================
- # difference between num_decodes and batch_size:
- # - num_decodes: how many outputs we actually want for each input
- # - batch_size: the batch size in model serving, should equal to that in
- # model serving config
- # constant definitions
- DEFAULT_MAX_TOKENS = 1024
- DEFAULT_TEMPERATURE = 0.0
- PALM_MODEL_NAME = "text-bison-001"
-
- if scorer_llm_name == "text-bison":
- config = {
- "temperature": DEFAULT_TEMPERATURE,
- "max_decode_steps": DEFAULT_MAX_TOKENS,
- "batch_size": 1,
- "num_servers": 1,
- }
- call_scorer_server_func = functools.partial(
- prompt_utils.call_palm_server_from_cloud,
- model=PALM_MODEL_NAME,
- **config
- )
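- # functools.partial pre-binds the model name and decoding settings, so that
- # call_scorer_server_func(prompt) only needs the prompt text at call time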
- scorer_llm_dict = {"model_type": "text-bison", **config}
-
- elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
- config = {
- "temperature": DEFAULT_TEMPERATURE,
- "max_decode_steps": DEFAULT_MAX_TOKENS,
- "batch_size": 1,
- "num_servers": 1,
- }
- call_scorer_server_func = functools.partial(
- prompt_utils.call_openai_server_func,
- model=scorer_llm_name.lower(),
- **config
- )
- scorer_llm_dict = {"model_type": scorer_llm_name.lower(), **config}
-
- elif scorer_llm_name == "local":
- print(f"[DEBUG] local_model_path: {local_model_path}")
- assert local_model_path, "Local model path must be provided."
- config = {
- "temperature": DEFAULT_TEMPERATURE,
- "max_decode_steps": DEFAULT_MAX_TOKENS,
- "batch_size": 8,
- "num_servers": 8,# number of servers to use for local model
- }
- call_scorer_server_func = functools.partial(
- prompt_utils.call_local_server_func,
- local_model_path=local_model_path,
- **config
- )
- scorer_llm_dict = {"model_type": "local", **config}
-
- else:
- raise ValueError(f"Unsupported model: {scorer_llm_name}")
-
-# if scorer_llm_name == "text-bison":
-# # when prompting text-bison with Cloud API
-# scorer_finetuned_palm_temperature = 0.0
-# scorer_finetuned_palm_max_decode_steps = 1024
-# scorer_finetuned_palm_batch_size = 1
-# scorer_finetuned_palm_num_servers = 1
-# scorer_finetuned_palm_dict = dict()
-# scorer_finetuned_palm_dict["temperature"] = (
-# scorer_finetuned_palm_temperature
-# )
-# scorer_finetuned_palm_dict["num_servers"] = (
-# scorer_finetuned_palm_num_servers
-# )
-# scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size
-# scorer_finetuned_palm_dict["max_decode_steps"] = (
-# scorer_finetuned_palm_max_decode_steps
-# )
-
-# call_scorer_finetuned_palm_server_func = functools.partial(
-# prompt_utils.call_palm_server_from_cloud,
-# model="text-bison-001",
-# temperature=scorer_finetuned_palm_dict["temperature"],
-# max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"],
-# )
-
-# scorer_llm_dict = {
-# "model_type": scorer_llm_name.lower(),
-# }
-# scorer_llm_dict.update(scorer_finetuned_palm_dict)
-# call_scorer_server_func = call_scorer_finetuned_palm_server_func
-
-# elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
-# # assert scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}
-# scorer_gpt_max_decode_steps = 1024
-# scorer_gpt_temperature = 0.0
-
-# scorer_gpt_dict = dict()
-# scorer_gpt_dict["max_decode_steps"] = scorer_gpt_max_decode_steps
-# scorer_gpt_dict["temperature"] = scorer_gpt_temperature
-# scorer_gpt_dict["num_decodes"] = 1
-# scorer_gpt_dict["batch_size"] = 1
-# scorer_gpt_dict["num_servers"] = 1
-
-# scorer_llm_dict = {
-# "model_type": scorer_llm_name.lower(),
-# }
-# scorer_llm_dict.update(scorer_gpt_dict)
-# call_scorer_server_func = functools.partial(
-# prompt_utils.call_openai_server_func,
-# model=scorer_llm_name.lower(),
-# max_decode_steps=scorer_gpt_max_decode_steps,
-# temperature=scorer_gpt_temperature,
-# )
-# elif scorer_llm_name == "local":
-# # local vLLM model
-# scorer_local_max_decode_steps = 1024
-# scorer_local_temperature = 0.0
-# call_scorer_server_func = functools.partial(
-# prompt_utils.call_local_model_server_func,
-# model_path=local_model_path,
-# max_decode_steps=scorer_local_max_decode_steps,
-# temperature=scorer_local_temperature,
-# )
-# else:
-# raise ValueError(
-# f"Unknown scorer_llm_name: {scorer_llm_name}. "
-# "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
-# )
-
- # ====================== optimizer model configs ============================
- if optimizer_llm_name.lower() == "text-bison":
- # PaLM text-bison model config
- optimizer_llm_dict = {
- "model_type": "text-bison",
- "temperature": 1.0, # 更高的随机性以生成多样化解
- "max_decode_steps": 1024, # 最大生成长度
- "batch_size": 1, # 单样本处理
- "num_decodes": 8, # 生成8个候选结果
- "num_servers": 1 # 单服务器
- }
-
- call_optimizer_server_func = functools.partial(
- prompt_utils.call_palm_server_from_cloud,
- model="text-bison-001",
- temperature=optimizer_llm_dict["temperature"],
- max_decode_steps=optimizer_llm_dict["max_decode_steps"],
- )
-
- elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}:
- # GPT model config
- optimizer_llm_dict = {
- "model_type": optimizer_llm_name.lower(),
- "temperature": 1.0, # 更高的随机性
- "max_decode_steps": 512, # 较短的最大长度
- "batch_size": 1,
- "num_decodes": 1 , # 单次生成
- "num_servers": 1 # 单服务器
- }
-
- call_optimizer_server_func = functools.partial(
- prompt_utils.call_openai_server_func,
- model=optimizer_llm_name,
- max_decode_steps=optimizer_llm_dict["max_decode_steps"],
- temperature=optimizer_llm_dict["temperature"],
- )
- elif optimizer_llm_name.lower() == "local":
- assert local_model_path, "Local model path must be provided."
- optimizer_llm_dict = {
- "model_type": optimizer_llm_name.lower(),
- "temperature": 1.0, # 更高的随机性
- "max_decode_steps": 512, # 较短的最大长度
- "batch_size": 8,
- "num_decodes": 1 , # 单次生成
- "num_servers": 8 # 单服务器
- }
- call_optimizer_server_func = functools.partial(
- prompt_utils.call_local_server_func,
- local_model_path=local_model_path,
- max_decode_steps=optimizer_llm_dict["max_decode_steps"],
- temperature=optimizer_llm_dict["temperature"],
- )
-
- else:
- raise ValueError(
- f"Unsupported optimizer model: {optimizer_llm_name}. "
- "Must be one of: text-bison, gpt-3.5-turbo, gpt-4"
- )
-
-# if optimizer_llm_name.lower() == "text-bison":
-# # when prompting text-bison with Cloud API
-# optimizer_finetuned_palm_temperature = 1.0
-# optimizer_finetuned_palm_num_decodes = 8
-# optimizer_finetuned_palm_max_decode_steps = 1024
-# optimizer_finetuned_palm_batch_size = 1
-# optimizer_finetuned_palm_num_servers = 1
-# optimizer_finetuned_palm_dict = dict()
-# optimizer_finetuned_palm_dict["temperature"] = (
-# optimizer_finetuned_palm_temperature
-# )
-# optimizer_finetuned_palm_dict["num_decodes"] = (
-# optimizer_finetuned_palm_num_decodes
-# )
-# optimizer_finetuned_palm_dict["batch_size"] = (
-# optimizer_finetuned_palm_batch_size
-# )
-# optimizer_finetuned_palm_dict["num_servers"] = (
-# optimizer_finetuned_palm_num_servers
-# )
-# optimizer_finetuned_palm_dict["max_decode_steps"] = (
-# optimizer_finetuned_palm_max_decode_steps
-# )
-
-# call_optimizer_finetuned_palm_server_func = functools.partial(
-# prompt_utils.call_palm_server_from_cloud,
-# model="text-bison-001",
-# temperature=optimizer_finetuned_palm_dict["temperature"],
-# max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
-# )
-
-# optimizer_llm_dict = {
-# "model_type": optimizer_llm_name.lower(),
-# }
-# optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
-# call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
-
-# else:
-# assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
-# optimizer_gpt_max_decode_steps = 512
-# optimizer_gpt_temperature = 1.0
-
-# optimizer_llm_dict = dict()
-# optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
-# optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
-# optimizer_llm_dict["batch_size"] = 1
-# optimizer_llm_dict["num_decodes"] = 1
-# call_optimizer_server_func = functools.partial(
-# prompt_utils.call_openai_server_func,
-# model=optimizer_llm_name,
-# max_decode_steps=optimizer_gpt_max_decode_steps,
-# temperature=optimizer_gpt_temperature,
-# )
-
- # ====================== try calling the servers ============================
- print("\n======== testing the scorer and optimizer servers ===========")
- scorer_test_output = call_scorer_server_func(
- "Does the sun rise from the north? Just answer yes or no."
- )
- print(f"number of scorer output decodes: {len(scorer_test_output)}")
- print(f"scorer test output: {scorer_test_output}")
- optimizer_test_output = call_optimizer_server_func(
- "Does the sun rise from the north? Just answer yes or no.",
- temperature=1.0,
- )
- print(f"number of optimizer output decodes: {len(optimizer_test_output)}")
- print(f"optimizer test output: {optimizer_test_output}")
- print("Finished testing the servers.")
-
- # ====================== read data ============================
- print("\n================ prompt optimization settings ==============")
- # from https://github.com/hendrycks/test/blob/master/categories.py
- subcategories = {
- "abstract_algebra": ["math"],
- "anatomy": ["health"],
- "astronomy": ["physics"],
- "business_ethics": ["business"],
- "clinical_knowledge": ["health"],
- "college_biology": ["biology"],
- "college_chemistry": ["chemistry"],
- "college_computer_science": ["computer science"],
- "college_mathematics": ["math"],
- "college_medicine": ["health"],
- "college_physics": ["physics"],
- "computer_security": ["computer science"],
- "conceptual_physics": ["physics"],
- "econometrics": ["economics"],
- "electrical_engineering": ["engineering"],
- "elementary_mathematics": ["math"],
- "formal_logic": ["philosophy"],
- "global_facts": ["other"],
- "high_school_biology": ["biology"],
- "high_school_chemistry": ["chemistry"],
- "high_school_computer_science": ["computer science"],
- "high_school_european_history": ["history"],
- "high_school_geography": ["geography"],
- "high_school_government_and_politics": ["politics"],
- "high_school_macroeconomics": ["economics"],
- "high_school_mathematics": ["math"],
- "high_school_microeconomics": ["economics"],
- "high_school_physics": ["physics"],
- "high_school_psychology": ["psychology"],
- "high_school_statistics": ["math"],
- "high_school_us_history": ["history"],
- "high_school_world_history": ["history"],
- "human_aging": ["health"],
- "human_sexuality": ["culture"],
- "international_law": ["law"],
- "jurisprudence": ["law"],
- "logical_fallacies": ["philosophy"],
- "machine_learning": ["computer science"],
- "management": ["business"],
- "marketing": ["business"],
- "medical_genetics": ["health"],
- "miscellaneous": ["other"],
- "moral_disputes": ["philosophy"],
- "moral_scenarios": ["philosophy"],
- "nutrition": ["health"],
- "philosophy": ["philosophy"],
- "prehistory": ["history"],
- "professional_accounting": ["other"],
- "professional_law": ["law"],
- "professional_medicine": ["health"],
- "professional_psychology": ["psychology"],
- "public_relations": ["politics"],
- "security_studies": ["politics"],
- "sociology": ["culture"],
- "us_foreign_policy": ["politics"],
- "virology": ["health"],
- "world_religions": ["philosophy"],
- }
-
- categories = {
- "STEM": [
- "physics",
- "chemistry",
- "biology",
- "computer science",
- "math",
- "engineering",
- ],
- "humanities": ["history", "philosophy", "law"],
- "social sciences": [
- "politics",
- "culture",
- "economics",
- "geography",
- "psychology",
- ],
- "other (business, health, misc.)": ["other", "business", "health"],
- }
-
- if dataset_name == "mmlu":
- # EITHER: filter by category
- # category_names = [
- # "STEM",
- # "humanities",
- # "social sciences",
- # "other (business, health, misc.)",
- # ]
- category_names = [task_name]
- folder_name = "test" # one of {'auxiliary_train', 'dev', 'val', 'test'}
- task_names = []
- for task_csv_name in os.listdir(
- os.path.join(root_data_folder_path, folder_name)
- ):
- task_names.append(task_csv_name.split(".")[0])
-
- tasks_in_category = []
- for category_name in category_names:
- for task_name in task_names:
- for subname in subcategories:
- if subname in task_name:
- if subcategories[subname][0] in categories[category_name]:
- tasks_in_category.append(task_name)
- break
-
- tasks_all = [(folder_name, task_name) for task_name in tasks_in_category]
- multiple_choice_tasks = set([item[1] for item in tasks_all])
- boolean_tasks = set()
- numerical_output_tasks = set()
-
- # OR: filter by task
- # tasks_all = [
- # # ('test', 'abstract_algebra_test'),
- # # ('test', 'college_computer_science_test'),
- # # ('test', 'college_mathematics_test'),
- # # ('test', 'college_physics_test'),
- # # ('test', 'elementary_mathematics_test'),
- # # ('test', 'global_facts_test'),
- # # ('test', 'high_school_physics_test'),
- # # ('test', 'machine_learning_test'),
- # # ('test', 'management_test'),
- # # ('test', 'medical_genetics_test'),
- # # ('test', 'moral_scenarios_test'),
- # # ('test', 'professional_psychology_test'),
- # # ('test', 'public_relations_test'),
- # # ('test', 'professional_law_test'),
- # # ('test', 'high_school_psychology_test'),
- # # ('test', 'high_school_world_history_test'),
- # # ('test', 'human_aging_test'),
- # # ('test', 'miscellaneous_test'),
- # # ('test', 'moral_scenarios_test'),
- # ('test', 'professional_psychology_test'),
- # # ('test', 'security_studies_test'),
- # ]
-
- elif dataset_name == "bbh":
- tasks_all = [task_name]
- assert (
- len(tasks_all) == 1
- ), "for now only support prompt optimization on one BBH task"
-
- # all BBH tasks are as below
- # tasks_all = [
- # 'boolean_expressions',
- # 'causal_judgement',
- # 'date_understanding',
- # 'disambiguation_qa',
- # 'dyck_languages',
- # 'formal_fallacies',
- # 'geometric_shapes',
- # 'hyperbaton',
- # 'logical_deduction_five_objects',
- # 'logical_deduction_seven_objects',
- # 'logical_deduction_three_objects',
- # 'movie_recommendation',
- # 'multistep_arithmetic_two',
- # 'navigate',
- # 'object_counting',
- # 'penguins_in_a_table',
- # 'reasoning_about_colored_objects',
- # 'ruin_names',
- # 'salient_translation_error_detection',
- # 'snarks',
- # 'sports_understanding',
- # 'temporal_sequences',
- # 'tracking_shuffled_objects_five_objects',
- # 'tracking_shuffled_objects_seven_objects',
- # 'tracking_shuffled_objects_three_objects',
- # 'web_of_lies',
- # 'word_sorting'
- # ]
- numerical_output_tasks = {
- "object_counting",
- "multistep_arithmetic_two",
- }
-
- multiple_choice_tasks = {
- "date_understanding",
- "disambiguation_qa",
- "geometric_shapes",
- "hyperbaton",
- "logical_deduction_five_objects",
- "logical_deduction_seven_objects",
- "logical_deduction_three_objects",
- "movie_recommendation",
- "penguins_in_a_table",
- "reasoning_about_colored_objects",
- "ruin_names",
- "salient_translation_error_detection",
- "snarks",
- "temporal_sequences",
- "tracking_shuffled_objects_five_objects",
- "tracking_shuffled_objects_seven_objects",
- "tracking_shuffled_objects_three_objects",
- }
-
- boolean_tasks = {
- "boolean_expressions", # True or False
- "causal_judgement", # yes or no
- "formal_fallacies", # valid or invalid
- "navigate", # yes or no
- "sports_understanding", # yes or no
- "web_of_lies", # yes or no
- }
-
- else:
- assert dataset_name in {"gsm8k"}
- tasks_all = [task_name]
- multiple_choice_tasks = set()
- boolean_tasks = set()
- numerical_output_tasks = set(tasks_all)
-
- if dataset_name == "mmlu":
- raw_data = pd.DataFrame()
- prediction_treat_as_number = False
- prediction_treat_as_bool = False
- elif dataset_name == "bbh":
- raw_data = []
- prediction_treat_as_number = bool(
- tasks_all[0] in numerical_output_tasks
- ) # for now only check the first task
- prediction_treat_as_bool = bool(
- tasks_all[0] in boolean_tasks
- ) # for now only check the first task
- print(
- f"prediction_treat_as_number: {prediction_treat_as_number},"
- f" prediction_treat_as_bool: {prediction_treat_as_bool}"
- )
- else:
- assert dataset_name == "gsm8k"
- raw_data = pd.DataFrame()
- prediction_treat_as_number = True
- prediction_treat_as_bool = False
-
- for t in tasks_all:
- if dataset_name == "mmlu":
- folder_name = t[0]
- task_name = t[1]
- single_task_df = pd.read_csv(
- os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"),
- index_col=None,
- header=None,
- )
- raw_data = pd.concat([raw_data, single_task_df])
- elif dataset_name == "bbh":
- task_name = t
- single_task_list = opt_utils.load_bbh_task_data(
- task_name, base_dir=root_data_folder_path
- )
- raw_data += single_task_list
- else:
- assert dataset_name == "gsm8k"
- task_name = t
- f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv")
- single_task_df = pd.read_csv(f_gsm, sep="\t", header=None)
- raw_data = pd.concat([raw_data, single_task_df])
-
- if dataset_name == "mmlu":
- num_examples = raw_data.shape[0]
- elif dataset_name == "bbh":
- num_examples = len(raw_data)
- else:
- assert dataset_name in {"gsm8k"}
- num_examples = raw_data.shape[0]
- print(f"number of examples in the current task: {num_examples}")
-
- # ================ split data into train/val/test ==========================
- if dataset_name == "mmlu":
- train_ratio = 0.8
- eval_ratio = 0.2
- elif dataset_name == "gsm8k":
- train_ratio = 0.035
- eval_ratio = 0
- else:
- assert dataset_name == "bbh"
- train_ratio = 0.2
- eval_ratio = 0
-
- # train-validation-test split
- # It is important to sort the indices, as this ensures the is_multiple_choice
- # Boolean variables match the data points.
- assert train_ratio + eval_ratio <= 1
- test_ratio = 1 - train_ratio - eval_ratio
- print(
- f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, "
- f"test_ratio: {test_ratio}"
- )
- np.random.seed(0)
- train_index = np.sort(
- np.array(
- np.random.choice(
- num_examples, size=int(train_ratio * num_examples), replace=False
- )
- )
- )
- eval_and_test_index = np.sort(
- np.array(list(set(np.arange(num_examples)) - set(train_index)))
- )
- eval_index = np.sort(
- np.array(
- np.random.choice(
- eval_and_test_index,
- size=int(eval_ratio * num_examples),
- replace=False,
- )
- )
- )
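-  # The test split is implicitly the remaining indices (eval_and_test_index
-  # minus eval_index); only train_index and eval_index are passed to
-  # run_evolution below.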
-
- # ========== set other optimization experiment hyperparameters ==============
- if scorer_llm_name == "text-bison":
-    old_instruction_score_threshold = 0.0  # keep all old instructions: do not filter out any historical instruction, even low-quality ones
- # old_instruction_score_threshold = 0.15 # for GSM8K
- elif scorer_llm_name == "local":
- old_instruction_score_threshold = 0.3
- else:
-    assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}  # validate the scorer model name
-    old_instruction_score_threshold = 0.3  # filter out low-quality old instructions
-
- if scorer_llm_name == "text-bison":
-    extract_final_answer_by_prompting_again = False  # whether to extract the final answer with a second prompt (e.g. from a verbose response)
-    include_qa = False  # whether to include QA pairs in the meta-prompt
-    evaluate_in_parallel = False  # whether to run evaluation in parallel
- elif scorer_llm_name == "local":
- extract_final_answer_by_prompting_again = True
- include_qa = True
- evaluate_in_parallel = True
- else:
- assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
- extract_final_answer_by_prompting_again = False
- include_qa = False
- evaluate_in_parallel = False
-
- optimizer_llm_temperature = optimizer_llm_dict["temperature"]
-
-  num_few_shot_questions_for_instruction_refinement = 3  # number of few-shot QA exemplars shown when refining instructions in each step
-
- # To change the number of generated instructions in each step, one should
- # edit the value of the variable below, instead of editing the number of
- # decodes in model parameters, because those values are limited by model
- # serving configs.
-  num_generated_instructions_in_each_step = 3  # candidate instructions generated per search step
-  num_search_steps = 50  # total number of optimization (search) steps
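-  # Rough scorer-call budget (assuming each generated instruction is scored on
-  # the full training split): num_search_steps *
-  # num_generated_instructions_in_each_step * len(train_index), plus the
-  # periodic validation-set evaluations every eval_interval steps.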
-
- initial_instructions = [
- "Let's solve the problem.",
- # "",
- # "The answer is",
- ]
-  few_shot_qa_pairs = True  # whether to use few-shot exemplars to guide instruction generation
-  # one of {'accumulative_most_frequent', 'current_most_frequent', 'random',
-  # 'constant'}; "random" favors diversity, while the *_most_frequent options
-  # favor stability
-  few_shot_selection_criteria = "random"
- # whether to evaluate generated instructions on the exemplars in meta-prompt
-  evaluate_generated_ins_on_few_shot = False  # set True during development to debug instruction quality
- # whether to evaluate old instructions on the exemplars in the meta-prompt
-  evaluate_old_ins_on_few_shot = False  # keep False in production runs to save time
- # every this number of steps, compute the accuracies of current-step
- # instructions on the validation set
-  eval_interval = 3
-
-  max_num_instructions = (
-      20  # the maximum number of old instructions kept in the meta-prompt
-  )
-  # discretize continuous scores into this many buckets (e.g. integers 0-100),
-  # making them easier for the optimizer model to compare
- num_score_buckets = 100
-  # whether to put old instructions and scores before the exemplars in the
-  # meta-prompt, i.e. the ordering of past instructions vs. few-shot exemplars
-  meta_prompt_instructions_before_exemplars = True
-
- # ===================== run prompt optimization ======================
-
- assert few_shot_selection_criteria in {
- "accumulative_most_frequent",
- "current_most_frequent",
- "random",
- "constant",
- }
- evolution_kwargs = {
- "num_search_steps": num_search_steps,
- "old_instruction_score_threshold": old_instruction_score_threshold,
- "scorer_llm_dict": scorer_llm_dict,
- "optimizer_llm_dict": optimizer_llm_dict,
- "extract_final_answer_by_prompting_again": (
- extract_final_answer_by_prompting_again
- ),
- "include_qa": include_qa,
- "evaluate_in_parallel": evaluate_in_parallel,
- "tasks_all": tasks_all,
- "train_ratio": train_ratio,
- "eval_ratio": eval_ratio,
- "test_ratio": test_ratio,
- "train_index": train_index,
- "eval_index": eval_index,
- "dataset_name": dataset_name,
- "task_name": task_name,
- "num_examples": num_examples,
- "root_data_folder_path": root_data_folder_path,
- "optimizer_llm_temperature": optimizer_llm_temperature,
- # "optimizer_llm_temperature_schedule": (
- # optimizer_llm_temperature_schedule
- # ),
- # "optimizer_llm_temperature_end": optimizer_llm_temperature_end,
- "initial_instructions": initial_instructions,
- "multiple_choice_tasks": multiple_choice_tasks,
- "raw_data": raw_data,
- "call_scorer_server_func": call_scorer_server_func,
- "call_optimizer_server_func": call_optimizer_server_func,
- "instruction_pos": instruction_pos,
- "prediction_treat_as_number": prediction_treat_as_number,
- "prediction_treat_as_bool": prediction_treat_as_bool,
- "result_by_instruction_folder": result_by_instruction_folder,
- "few_shot_qa_pairs": few_shot_qa_pairs,
- "num_score_buckets": num_score_buckets,
- "max_num_instructions": max_num_instructions,
- "meta_prompt_type": meta_prompt_type,
- "meta_prompt_instructions_before_exemplars": (
- meta_prompt_instructions_before_exemplars
- ),
- "few_shot_selection_criteria": few_shot_selection_criteria,
- "optimizer_llm_name": optimizer_llm_name,
- "num_generated_instructions_in_each_step": (
- num_generated_instructions_in_each_step
- ),
- "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot,
- "num_few_shot_questions_for_instruction_refinement": (
- num_few_shot_questions_for_instruction_refinement
- ),
- "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot,
- "eval_interval": eval_interval,
- "save_folder": save_folder,
- }
-
- opt_utils.run_evolution(**evolution_kwargs)
-
-
-if __name__ == "__main__":
- app.run(main)
diff --git a/outputs/user_feedback.jsonl b/outputs/user_feedback.jsonl
deleted file mode 100644
index e504227..0000000
--- a/outputs/user_feedback.jsonl
+++ /dev/null
@@ -1,2 +0,0 @@
-{"session_id": "06e84e4b03c54df188961f4e033f9e77", "round": 1, "choice": "哪些奶茶含有较少的糖分?", "answer": "选择低糖奶茶时,可以通过品牌选择、糖分选项调整以及配料搭配来控制摄入的糖分。以下是一些具体建议和推荐:\n\n---\n\n### **一、低糖奶茶品牌及选项**\n1. **喜茶(HEYTEA)** \n - **推荐系列**:轻乳茶(如「轻乳茉莉奶绿」)、部分限定款(如「芝士奶盖茶」可选低糖)。 \n - **糖分控制**:默认糖度为50%,可选「三分糖」(约10%糖度)或「无糖」(使用代糖)。 \n - **注意**:奶盖本身含糖,建议选择「无奶盖」选项。\n\n2. **奈雪的茶** \n - **推荐系列**:鲜果茶(如「霸气橙子」)、「鲜果轻乳茶」系列。 \n - **糖分控制**:默认糖度为50%,可选「少糖」(约30%)或「无糖」(使用代糖)。 \n - **注意**:鲜果茶的天然果糖含量较高,需结合糖分选项综合判断。\n\n3. **CoCo都可** \n - **推荐系列**:「鲜果茶」(如「百香果鲜果茶」)、「轻乳茶」。 \n - **糖分控制**:默认全糖(约100%),可选「三分糖」(约30%)或「无糖」(代糖)。 \n - **注意**:部分经典奶茶(如「珍珠奶茶」)糖分较高,建议选择鲜果类饮品。\n\n4. **一点点** \n - **推荐系列**:「鲜果茶」(如「百香果鲜果茶」)、「轻乳茶」。 \n - **糖分控制**:默认全糖(约100%),可选「半糖」(50%)或「无糖」(代糖)。 \n - **注意**:部分奶盖茶含糖量较高,建议选择鲜果或轻乳系列。\n\n---\n\n### **二、如何选择低糖奶茶?**\n1. **糖分选项优先级** \n - **无糖/代糖**:使用赤藓糖醇、甜菊糖等代糖,几乎不含热量(但需注意部分人对代糖敏感)。 \n - **三分糖/半糖**:糖分约为全糖的30%-50%,适合控制摄入。 \n - **避免全糖**:全糖奶茶的糖分通常在"}
-{"session_id": "af370656137a4b8f97d8ffac2c60b191", "round": 1, "choice": "瑞幸咖啡有哪些值得推荐的饮品?", "answer": "瑞幸咖啡近年来凭借高性价比和创新口味,推出了许多受欢迎的饮品。以下是一些值得推荐的饮品,涵盖经典款、特色新品和季节限定,适合不同口味和需求:\n\n---\n\n### **一、爆款必试系列**\n1. **生椰拿铁** \n - **推荐理由**:瑞幸的“现象级”产品,椰香浓郁与咖啡的苦香完美融合,口感顺滑,常年占据销量榜首。 \n - **小贴士**:可尝试“生椰拿铁+巧克力糖浆”组合,变成“生椰摩卡”,风味更丰富。\n\n2. **丝绒拿铁** \n - **推荐理由**:以红茶为基底,搭配丝滑牛奶,茶香与奶香平衡,适合喜欢茶咖融合的人群。 \n - **特色**:选用锡兰红茶,口感更醇厚,冷热皆宜。\n\n3. **厚乳拿铁** \n - **推荐理由**:使用厚乳(高乳脂含量的牛奶),奶香更浓郁,适合追求绵密口感的爱好者。\n\n---\n\n### **二、果味与创意系列**\n1. **冰椰拿铁**(夏季限定) \n - **推荐理由**:生椰拿铁的冰饮版本,加入冰块和椰香糖浆,清爽解暑,适合夏天。\n\n2. **蓝莓生椰拿铁** \n - **推荐理由**:在生椰拿铁基础上加入蓝莓糖浆,果香与椰香交织,甜而不腻。\n\n3. **蜜桃生椰拿铁** \n - **推荐理由**:蜜桃风味糖浆与生椰拿1:1搭配,清新果香与咖啡的苦香碰撞,适合喜欢果味的人。\n\n---\n\n### **三、季节限定款**\n1. **桂花拿铁**(秋季限定) \n - **推荐理由**:桂花糖浆与拿铁结合,香气扑鼻,甜度适中,是"}