diff --git a/opro_qwen_xinference_demo_package.zip b/opro_qwen_xinference_demo_package.zip deleted file mode 100644 index 187905f..0000000 Binary files a/opro_qwen_xinference_demo_package.zip and /dev/null differ diff --git a/optimization/__pycache__/opt_utils.cpython-310.pyc b/optimization/__pycache__/opt_utils.cpython-310.pyc deleted file mode 100644 index 2c7a4a9..0000000 Binary files a/optimization/__pycache__/opt_utils.cpython-310.pyc and /dev/null differ diff --git a/optimization/opt_utils.py b/optimization/opt_utils.py deleted file mode 100644 index 36ad3bf..0000000 --- a/optimization/opt_utils.py +++ /dev/null @@ -1,1035 +0,0 @@ -# Copyright 2023 The OPRO Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""The utility functions for prompt optimization.""" - -import collections -import json -import os -import pickle -import re -import sys - -OPRO_ROOT_PATH = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -) -sys.path.insert(0, OPRO_ROOT_PATH) - -import numpy as np -from opro.evaluation import eval_utils -import pandas as pd - - -def extract_string_in_square_brackets(input_string): - raw_result = re.findall(r"\[.*?\]", input_string) - if raw_result: - return raw_result[0][1:-1] - else: - return "" #从输入的字符串 input_string中提取​​第一个被方括号 []包裹的内容​​,并返回去掉方括号后的结果。如果输入字符串中没有方括号,则返回空字符串 ""。 - - -def parse_tag_content(text, prefix="", suffix=""): - pattern = f"{prefix}(.*?){suffix}" - results = re.findall(pattern, text, re.DOTALL) - return results - - -def _bucketize_float(num, n_buckets=20): - assert num >= 0 and num <= 1, "The given number must be between 0 and 1." - return round(num * n_buckets) #此函数可能用于将​​指令的评分​​(如 0.85)转换为整数分桶 - - -def gen_ins_and_score_pairs_substr( - old_instructions_and_scores, - old_instruction_score_threshold=0.1, - max_num_instructions=1000, - return_str_only=False, - num_score_buckets=np.inf, -): - """Generate the string that includes instruction-score pairs.""" - assert num_score_buckets == np.inf or isinstance(num_score_buckets, int) - old_instructions_and_scores_str = "" - old_instructions_and_scores = sorted( - old_instructions_and_scores, key=lambda x: x[1] - )[-max_num_instructions:] - old_instructions_and_scores_in_meta_prompt = [] - for instruction, score, i_step in old_instructions_and_scores: - if ( - not old_instruction_score_threshold - or score >= old_instruction_score_threshold - ): - old_instructions_and_scores_in_meta_prompt.append( - (instruction, score, i_step) - ) - if num_score_buckets == np.inf: - score_to_show = round(score, 3) - else: - score_to_show = _bucketize_float(score, num_score_buckets) - old_instructions_and_scores_str += ( - f"\ntext:\n{instruction}\nscore:\n{score_to_show}\n" - ) - if return_str_only: - return old_instructions_and_scores_str - else: - return ( - old_instructions_and_scores_str, - old_instructions_and_scores_in_meta_prompt, - ) - - -def gen_meta_prompt( - old_instructions_and_scores, - instruction_pos, - optimizer_llm_name, - old_instruction_score_threshold=0.1, - max_num_instructions=1000, - meta_prompt_type="both_instructions_and_exemplars", - few_shot_qa_pairs=False, - include_qa=True, - data=None, - few_shot_index_list=None, - instructions_before_exemplars=True, - num_score_buckets=np.inf, - dataset_name="", - task_name="", -): - """Generate meta prompt for instruction rewriting. - - Args: - old_instructions_and_scores (list): a list of (instruction, score, i_step) - pairs. - instruction_pos (str): where to put the instruction, one of {'before_QA', - 'Q_begin', 'Q_end', 'A_begin'}. - optimizer_llm_name (str): the name of the LLM used for instruction editing. - old_instruction_score_threshold (float): only add old instructions with score - no less than this threshold. - max_num_instructions (int): the maximum number of instructions in the meta - prompt. - meta_prompt_type (str): the type of meta-prompt: whether to have both - previous instructions and dataset exemplars (often for fine-tuned - optimizers), or to have only previous instructions (often for pre-trained - optimizers). - few_shot_qa_pairs (bool): whether to have few-shot QA pairs in the meta - prompt. - include_qa (bool): whether to include "Q:" and "A:" formats in the prompt. - data (list or pd.DataFrame): the raw data. - few_shot_index_list (list): the list of indices of few-shot examples. - instructions_before_exemplars (bool): whether the instruction-score pairs are - before the exemplars from the dataset. - num_score_buckets (np.inf or int): the number of score buckets when we - convert float accuracies to integers. Default to np.inf for not - bucketizing. - dataset_name (str): the name of the current dataset. Only used when - generating task description when meta_prompt_type == "instructions_only". - task_name (str): the name of the current task. Only used when generating task - description when meta_prompt_type == "instructions_only". - - Returns: - meta_prompt (str): the generated meta prompt. - """ - assert instruction_pos in { - "before_Q", - "Q_begin", - "Q_end", - "A_begin", - }, ( - "The instruction position should be either before the question, or at the" - " beginning of the question, at the end of the question, or at the" - " beginning of the answer." - ) - assert meta_prompt_type in { - "both_instructions_and_exemplars", - "instructions_only", - } - assert dataset_name in { - "mmlu", - "bbh", - "gsm8k", - }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k." - assert num_score_buckets == np.inf or isinstance(num_score_buckets, int) - - meta_prompt = "" - if meta_prompt_type == "both_instructions_and_exemplars": - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4","local"}: # GPT模型的提示模板 - if instruction_pos == "A_begin":# 针对答案开头部分的描述 - meta_prompt_old_instruction_part = ( - "Your task is to generate the answer starting sentence ." - " Below are some previous starting sentences with their scores." - " The score ranges from 0 to 100.\n" - )#要求模型生成答案的起始句(如“The answer is...”),并参考历史评分数据。 - else:# 针对普通指令的描述 - meta_prompt_old_instruction_part = ( - "Your task is to generate the instruction ." - " Below are some previous instructions with their scores." - " The score ranges from 0 to 100.\n" - ) - else: # text-bison 模型的专用提示模板 - assert optimizer_llm_name.lower() == "text-bison" - meta_prompt_old_instruction_part = ( - "I have some texts along with their corresponding scores." - " The texts are arranged in ascending order based on their scores," - " where higher scores indicate better quality.\n\n" - ) - # add old instructions - old_instructions_and_scores_str = gen_ins_and_score_pairs_substr( - old_instructions_and_scores=old_instructions_and_scores, - old_instruction_score_threshold=old_instruction_score_threshold, - max_num_instructions=max_num_instructions, - return_str_only=True, - num_score_buckets=num_score_buckets, - ) - meta_prompt_old_instruction_part += old_instructions_and_scores_str - # add QA pairs if few_shot_qa_pairs == True - meta_prompt_exemplar_part = "" - if few_shot_qa_pairs: - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - meta_prompt_exemplar_part += "Below are some problems.\n" - else: - assert optimizer_llm_name.lower() == "text-bison" - meta_prompt_exemplar_part += ( - "The following exemplars show how to apply your text: you replace" - " in each input with your text, then read the input and give" - " an output. We say your output is wrong if your output is" - " different from the given output, and we say your output is" - " correct if they are the same. When replacing with an old" - " piece of text above, we get wrong outputs on the following" - " inputs.\n\n" - ) - for idx in few_shot_index_list: - if dataset_name == "mmlu": - question = eval_utils._format_mmlu_example(data, idx) # pylint: disable=protected-access - true_answer = data.iloc[idx, -1] - elif dataset_name == "bbh": - question = data[idx]["input"] - true_answer = data[idx]["target"] - else: - assert dataset_name == "gsm8k" - question = data.iloc[idx, 0] - true_answer = data.iloc[idx, 1] - - if include_qa: # when "Q:" and "A:" are present in the prompt - if instruction_pos == "before_Q": - meta_prompt_exemplar_part += f"\ninput:\n\nQ: {question}\nA:" - elif instruction_pos == "Q_begin": - meta_prompt_exemplar_part += f"\ninput:\nQ: \n{question}\nA:" - elif instruction_pos == "Q_end": - meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\n\nA:" - else: # instruction_pos == "A_begin" - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - meta_prompt_exemplar_part += f"\nQ: {question}\nA: " - else: - assert optimizer_llm_name.lower() == "text-bison" - meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\nA: " - else: # when there're no "Q:" and "A:" in the prompt - assert instruction_pos in {"Q_begin", "Q_end"} - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - if instruction_pos == "Q_begin": - meta_prompt_exemplar_part += f"\nProblem:\n\n{question}\n" - elif instruction_pos == "Q_end": - meta_prompt_exemplar_part += f"\nProblem:\n{question}\n\n" - else: - assert optimizer_llm_name.lower() == "text-bison" - if instruction_pos == "Q_begin": - meta_prompt_exemplar_part += f"\ninput:\n\n{question}\n" - elif instruction_pos == "Q_end": - meta_prompt_exemplar_part += f"\ninput:\n{question}\n\n" - - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - meta_prompt_exemplar_part += ( - f"\nGround truth answer:\n{true_answer}\n" - ) - else: - assert optimizer_llm_name.lower() == "text-bison" - meta_prompt_exemplar_part += f"\noutput:\n{true_answer}\n" - - if few_shot_qa_pairs: - if instructions_before_exemplars: - meta_prompt += ( - meta_prompt_old_instruction_part - + "\n\n" - + meta_prompt_exemplar_part - ) - else: - meta_prompt += ( - meta_prompt_exemplar_part - + "\n\n" - + meta_prompt_old_instruction_part - ) - else: - meta_prompt += meta_prompt_old_instruction_part - - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - if instruction_pos == "A_begin": - meta_prompt += ( - "\n\nGenerate a starting sentence that is different from all the" - " sentences above, and has a higher score than all the" - " sentences above. The starting sentence should begin with" - " and end with . The starting sentence should be" - " concise, effective, and generally applicable to all QA pairs" - " above." - ) - else: - meta_prompt += ( - "\n\nGenerate an instruction that" - " is different from all the instructions above," - " and has a higher score than all the instructions above." - " The instruction should begin with and end with ." - " The instruction should be concise, effective," - " and generally applicable to all problems above." - ) - else: - assert optimizer_llm_name.lower() == "text-bison" - meta_prompt += ( - "\n\nWrite your new text that is different from the old ones and" - " has a score as high as possible. Write the text in square brackets." - ) - else: - # when using a pre-trained model as optimizer - assert meta_prompt_type == "instructions_only" - - assert instruction_pos in {"Q_begin", "Q_end", "A_begin"} - if instruction_pos == "Q_begin": - instruction_pos_description = "at the beginning of the question" - elif instruction_pos == "Q_end": - instruction_pos_description = "at the end of the question" - else: - assert instruction_pos == "A_begin" - instruction_pos_description = "at the beginning of the answer" - - if dataset_name == "gsm8k": - instruction_task_description = "grade school math" - elif dataset_name == "mmlu": - instruction_task_description = task_name - else: - assert dataset_name == "bbh" - instruction_task_description = " ".join(task_name.split("_")) - - meta_instruction = ( - f"Create a piece of text {instruction_pos_description.strip()} to" - " enhance the precision in solving diverse" - f" {instruction_task_description.strip()} problems." - ) - old_instructions_and_scores = sorted( - old_instructions_and_scores, key=lambda x: x[1] - ) - old_instructions_and_scores_str = "" - for instruction, score, _ in old_instructions_and_scores: - if num_score_buckets == np.inf: - score_to_show = round(score, 2) - else: - score_to_show = _bucketize_float(score, num_score_buckets) - old_instructions_and_scores_str += ( - f"\n\nPrecision: {score_to_show} {instruction}" - ) - meta_prompt += meta_instruction + old_instructions_and_scores_str - return meta_prompt - - -def run_evolution(**kwargs): - """The function for evolution.""" - # ================= experiment configurations ============================= - num_search_steps = kwargs["num_search_steps"] - old_instruction_score_threshold = kwargs["old_instruction_score_threshold"] - scorer_llm_dict = kwargs["scorer_llm_dict"] - optimizer_llm_dict = kwargs["optimizer_llm_dict"] - extract_final_answer_by_prompting_again = kwargs[ - "extract_final_answer_by_prompting_again" - ] - include_qa = kwargs["include_qa"] - evaluate_in_parallel = kwargs["evaluate_in_parallel"] - tasks_all = kwargs["tasks_all"] - train_ratio = kwargs["train_ratio"] - eval_ratio = kwargs["eval_ratio"] - test_ratio = kwargs["test_ratio"] - train_index = kwargs["train_index"] - eval_index = kwargs["eval_index"] - dataset_name = kwargs["dataset_name"] - task_name = kwargs["task_name"] - num_examples = kwargs["num_examples"] - root_data_folder_path = kwargs["root_data_folder_path"] - optimizer_llm_temperature = kwargs["optimizer_llm_temperature"] - optimizer_llm_temperature_schedule = ( - kwargs["optimizer_llm_temperature_schedule"] - if "optimizer_llm_temperature_schedule" in kwargs - else "constant" - ) - optimizer_llm_temperature_end = ( - kwargs["optimizer_llm_temperature_end"] - if "optimizer_llm_temperature_end" in kwargs - else None - ) - initial_instructions = kwargs["initial_instructions"] - multiple_choice_tasks = kwargs["multiple_choice_tasks"] - raw_data = kwargs["raw_data"] - call_scorer_server_func = kwargs["call_scorer_server_func"] - call_optimizer_server_func = kwargs["call_optimizer_server_func"] - instruction_pos = kwargs["instruction_pos"] - prediction_treat_as_number = kwargs["prediction_treat_as_number"] - prediction_treat_as_bool = kwargs["prediction_treat_as_bool"] - result_by_instruction_folder = kwargs["result_by_instruction_folder"] - few_shot_qa_pairs = kwargs["few_shot_qa_pairs"] - num_score_buckets = kwargs["num_score_buckets"] - max_num_instructions = kwargs["max_num_instructions"] - meta_prompt_type = kwargs["meta_prompt_type"] - meta_prompt_instructions_before_exemplars = kwargs[ - "meta_prompt_instructions_before_exemplars" - ] - few_shot_selection_criteria = kwargs["few_shot_selection_criteria"] - optimizer_llm_name = kwargs["optimizer_llm_name"] - num_generated_instructions_in_each_step = kwargs[ - "num_generated_instructions_in_each_step" - ] - evaluate_generated_ins_on_few_shot = kwargs[ - "evaluate_generated_ins_on_few_shot" - ] - num_few_shot_questions_for_instruction_refinement = kwargs[ - "num_few_shot_questions_for_instruction_refinement" - ] - evaluate_old_ins_on_few_shot = kwargs["evaluate_old_ins_on_few_shot"] - eval_interval = kwargs["eval_interval"] - save_folder = kwargs["save_folder"] - verbose = kwargs["verbose"] if "verbose" in kwargs else False - - # =================== assertions ===================== - assert dataset_name in { - "mmlu", - "bbh", - "gsm8k", - }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k." - assert optimizer_llm_temperature_schedule in { - "constant", - "linear_increase", - }, "The temperature schedule should be constant or linear_increase." - - # =================== save configurations to json file ==================== - configs_dict = dict() - configs_dict["scorer_llm_dict"] = scorer_llm_dict - configs_dict["optimizer_llm_dict"] = optimizer_llm_dict - configs_dict["instruction_pos"] = instruction_pos - configs_dict["optimizer_llm_temperature"] = optimizer_llm_temperature - configs_dict["optimizer_llm_temperature_schedule"] = ( - optimizer_llm_temperature_schedule - ) - configs_dict["optimizer_llm_temperature_end"] = optimizer_llm_temperature_end - with open(os.path.join(save_folder, "configs_dict.json"), "w") as f: - json.dump(configs_dict, f, indent=4) - - num_servers = scorer_llm_dict["num_servers"] - batch_size = scorer_llm_dict["batch_size"] - generated_ins_on_few_shot_results_dict = dict() - old_ins_on_few_shot_results_dict = dict() - # evaluation results every a few steps - # format: [(i_step, instruction, detailed_results_df)] - eval_results = [] - # all generated instructions, format: [(instruction, score, step_index)] - # the instructions that were skipped have score NaN - old_instructions_and_scores_raw = [] - # the new instructions, format: [(instruction, score, step_index)] - old_instructions_and_scores = [] - meta_prompts = [] # format: [(meta_prompt, step_index)] - instruction_score_dict = dict() # the dictionary of {instruction: score} - # the dictionary of the few-shot QA indices in meta-prompt - # key: step index; value: the list of few-shot indices in that step - few_shot_index_list_by_step_dict = dict() - detailed_results_df_by_instruction_dict = dict() - wrong_questions_from_start_counter = collections.Counter() - # EVAL results - eval_detailed_results_df_dict = dict() # {instruction: detailed_results_df} - instruction_eval_score_dict = dict() # {instruction: eval_score} - old_instruction_md5_hashstrings_set = set() - - print(f"tasks_all: {tasks_all}") - print( - f"train_ratio: {train_ratio}, number of training points:" - f" {int(num_examples * train_ratio)}" - ) - print( - f"eval_ratio: {eval_ratio}, number of eval points: " - f"{int(num_examples * eval_ratio)}" - ) - print( - f"test_ratio: {test_ratio}, number of test points: " - f"{int(num_examples * test_ratio)}" - ) - print( - f"optimizer llm temperature: {optimizer_llm_temperature}, schedule:" - f" {optimizer_llm_temperature_schedule}" - ) - print( - f"generating {num_generated_instructions_in_each_step} instructions in" - f" each step, run for {num_search_steps} steps" - ) - print( - "discarding generated instructions with score less than:" - f" {old_instruction_score_threshold} (old_instruction_score_threshold)" - ) - print(f"num_score_buckets: {num_score_buckets}") - - if dataset_name == "mmlu": - is_multiple_choice = True - is_multiple_choice_eval = True - elif dataset_name in {"gsm8k"}: - is_multiple_choice = False - is_multiple_choice_eval = False - else: - assert dataset_name == "bbh" - is_multiple_choice = [] - is_multiple_choice_eval = [] - train_index_by_task_dict = dict() - eval_index_by_task_dict = dict() - start_index = 0 - for task_name in tasks_all: - single_task_list = eval_utils.load_bbh_task_data( - task_name, base_dir=root_data_folder_path - ) - end_index = start_index + len(single_task_list) - train_index_by_task_dict[task_name] = ( - train_index[(train_index >= start_index) & (train_index < end_index)] - # if " - start_index" is added here, then the dict would contain - # indices in the original task - ) - eval_index_by_task_dict[task_name] = ( - eval_index[(eval_index >= start_index) & (eval_index < end_index)] - # if " - start_index" is added here, then the dict would contain - # indices in the original task - ) - start_index = end_index - is_multiple_choice_single_task_train = [ - task_name in multiple_choice_tasks - ] * len(train_index_by_task_dict[task_name]) - is_multiple_choice_single_task_eval = [ - task_name in multiple_choice_tasks - ] * len(eval_index_by_task_dict[task_name]) - is_multiple_choice += is_multiple_choice_single_task_train - is_multiple_choice_eval += is_multiple_choice_single_task_eval - - prev_saved_instructions = set() - - # evaluate initial instructions - print("\n============== evaluating initial instructions ===============") - for instruction in initial_instructions: - print(f"""computing the score of "{instruction}" by prompting""") - - detailed_results_df = eval_utils.evaluate_single_instruction( - data=raw_data, - instruction=instruction, - eval_index_all=train_index, - batch_size=batch_size, - call_server_func=call_scorer_server_func, - dataset_name=dataset_name, - num_servers=num_servers, - extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again, - include_qa=include_qa, - evaluate_in_parallel=evaluate_in_parallel, - instruction_pos=instruction_pos, - is_multiple_choice=is_multiple_choice, - prediction_treat_as_number=prediction_treat_as_number, - prediction_treat_as_bool=prediction_treat_as_bool, - prediction_num_decimals=0, - max_retry=120, - sleep_time=60, - verbose=verbose, - ) - - detailed_results_df_by_instruction_dict[instruction] = detailed_results_df - scores = detailed_results_df["accuracy"] - average_score = np.average(scores) - print(f"instruction: {instruction}, score: {average_score}") - filename = eval_utils.instruction_to_filename(instruction) - file_path = os.path.join(result_by_instruction_folder, f"{filename}.csv") - detailed_results_df.to_csv(file_path, index=True, header=True) - print(f"""saving results of "{instruction}" to {file_path}""") - old_instructions_and_scores.append((instruction, average_score, -1)) - old_instructions_and_scores_raw.append((instruction, average_score, -1)) - instruction_score_dict[instruction] = average_score - - # increment the counter on wrong questions - wrong_question_indices_set = set( - list( - detailed_results_df.iloc[ - np.where(detailed_results_df.accuracy == 0.0)[0], : - ].index - ) - ) - for idx in wrong_question_indices_set: - wrong_questions_from_start_counter[idx] += 1 - - # evolution - for i_step in range(num_search_steps): - print(f"\n================== Step {i_step} =====================") - if not i_step % 10: - print(f"old_instructions_and_scores: {old_instructions_and_scores}") - - if optimizer_llm_temperature_schedule == "linear_increase": - optimizer_llm_temperature_curr = ( - optimizer_llm_temperature - + i_step - / num_search_steps - * (optimizer_llm_temperature_end - optimizer_llm_temperature) - ) - else: - optimizer_llm_temperature_curr = optimizer_llm_temperature - print( - f"current optimizer_llm_temperature: {optimizer_llm_temperature_curr}" - ) - - # generate new instructions - if few_shot_qa_pairs: - if few_shot_selection_criteria == "accumulative_most_frequent": - # select QA pairs that were done wrong the most number of times - most_frequent_wrong_question_indices = [ - k - for k, _ in sorted( - wrong_questions_from_start_counter.items(), key=lambda x: -x[1] - ) - ] - print( - "len(most_frequent_wrong_question_indices):" - f" {len(most_frequent_wrong_question_indices)}" - ) - if ( - len(most_frequent_wrong_question_indices) - <= num_few_shot_questions_for_instruction_refinement - ): - few_shot_index_list = most_frequent_wrong_question_indices.copy() - else: - np.random.seed(i_step) - few_shot_index_list = np.sort( - np.random.choice( - most_frequent_wrong_question_indices, - num_few_shot_questions_for_instruction_refinement, - replace=False, - ) - ) - - elif few_shot_selection_criteria == "current_most_frequent": - # show exemplars done wrong most often by currently shown instructions - old_instruction_score_threshold_single_step = ( - old_instruction_score_threshold if i_step > 0 else 0 - ) - _, old_instructions_and_scores_in_meta_prompt = ( - gen_ins_and_score_pairs_substr( - old_instructions_and_scores=old_instructions_and_scores, - old_instruction_score_threshold=old_instruction_score_threshold_single_step, - max_num_instructions=max_num_instructions, - return_str_only=False, - num_score_buckets=num_score_buckets, - ) - ) - wrong_questions_counter_single_step = collections.Counter() - for ins, _, _ in old_instructions_and_scores_in_meta_prompt: - filename = eval_utils.instruction_to_filename(ins) - file_path = os.path.join( - result_by_instruction_folder, f"{filename}.csv" - ) - single_ins_df = pd.read_csv(file_path, index_col=0, header=0) - wrong_question_indices_set_single_old_ins = set( - list( - single_ins_df.iloc[ - np.where(single_ins_df.accuracy == 0.0)[0], : - ].index - ) - ) - for idx in wrong_question_indices_set_single_old_ins: - wrong_questions_counter_single_step[idx] += 1 - most_occurred_wrong_questions = [ - k - for k, v in wrong_questions_counter_single_step.items() - if v == max(wrong_questions_counter_single_step.values()) - ] - if ( - len(most_occurred_wrong_questions) - < num_few_shot_questions_for_instruction_refinement - ): - # pylint: disable=cell-var-from-loop - idx_most_to_least = sorted( - wrong_questions_counter_single_step, - key=lambda x: -wrong_questions_counter_single_step[x], - ) - few_shot_index_list = idx_most_to_least[ - :num_few_shot_questions_for_instruction_refinement - ] - else: - few_shot_index_list = np.sort( - np.random.choice( - most_occurred_wrong_questions, - num_few_shot_questions_for_instruction_refinement, - replace=False, - ) - ) - elif few_shot_selection_criteria == "constant": - np.random.seed(0) - few_shot_index_list = np.sort( - np.random.choice( - train_index, - num_few_shot_questions_for_instruction_refinement, - replace=False, - ) - ) - else: - assert few_shot_selection_criteria == "random" - np.random.seed(i_step) - few_shot_index_list = np.sort( - np.random.choice( - train_index, - num_few_shot_questions_for_instruction_refinement, - replace=False, - ) - ).tolist() - - few_shot_index_list_by_step_dict[i_step] = few_shot_index_list - - meta_prompt = gen_meta_prompt( - old_instructions_and_scores=old_instructions_and_scores, - instruction_pos=instruction_pos, - optimizer_llm_name=optimizer_llm_name, - old_instruction_score_threshold=old_instruction_score_threshold, - max_num_instructions=max_num_instructions, - meta_prompt_type=meta_prompt_type, - few_shot_qa_pairs=few_shot_qa_pairs, - include_qa=include_qa, - data=raw_data, - few_shot_index_list=few_shot_index_list, - instructions_before_exemplars=meta_prompt_instructions_before_exemplars, - num_score_buckets=num_score_buckets, - dataset_name=dataset_name, - task_name=task_name, - ) - - else: # no few-shot exemplars in meta-prompt - few_shot_index_list = [] - meta_prompt = gen_meta_prompt( - old_instructions_and_scores=old_instructions_and_scores, - instruction_pos=instruction_pos, - optimizer_llm_name=optimizer_llm_name, - old_instruction_score_threshold=old_instruction_score_threshold, - max_num_instructions=max_num_instructions, - meta_prompt_type=meta_prompt_type, - few_shot_qa_pairs=False, - include_qa=include_qa, - instructions_before_exemplars=meta_prompt_instructions_before_exemplars, - num_score_buckets=num_score_buckets, - dataset_name=dataset_name, - task_name=task_name, - ) - print(f"\nmeta_prompt: \n\n{meta_prompt}\n") - meta_prompts.append((meta_prompt, i_step)) - remaining_num_instructions_to_generate = ( - num_generated_instructions_in_each_step - ) - generated_instructions_raw = [] - while remaining_num_instructions_to_generate > 0: - optimizer_llm_input_text = meta_prompt - # generate instructions - print(f"current temperature: {optimizer_llm_temperature_curr}") - raw_outputs = call_optimizer_server_func( - optimizer_llm_input_text, - temperature=optimizer_llm_temperature_curr, - ) - - # Extract the generated instructions from the optimizer LLM output. Only - # keep some samples if the desired number of remaining instructions - # is smaller than the total number of decodes in this step. - if meta_prompt_type == "both_instructions_and_exemplars": - raw_outputs = raw_outputs[:remaining_num_instructions_to_generate] - if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - if instruction_pos == "A_begin": - start_string = "" - end_string = "" - else: - start_string = "+" - end_string = "" - for raw_output in raw_outputs: - if start_string not in raw_output: - start_index = 0 - else: - start_index = raw_output.index(start_string) + len(start_string) - if end_string not in raw_output: - end_index = len(raw_output) - else: - end_index = raw_output.index(end_string) - new_inst = raw_output[start_index:end_index].strip() - generated_instructions_raw.append(new_inst) - else: - assert optimizer_llm_name.lower() == "text-bison" - generated_instructions_raw += [ - extract_string_in_square_brackets(string) - for string in raw_outputs - ] - - remaining_num_instructions_to_generate -= optimizer_llm_dict[ - "batch_size" - ] - else: - assert meta_prompt_type == "instructions_only" - max_num_instructions_to_keep_in_each_output = 1 - for string in raw_outputs: - generated_instructions_raw += parse_tag_content(string)[ - :max_num_instructions_to_keep_in_each_output - ] - remaining_num_instructions_to_generate -= ( - optimizer_llm_dict["batch_size"] - * max_num_instructions_to_keep_in_each_output - ) - - generated_instructions_raw = list( - map(eval_utils.polish_sentence, generated_instructions_raw) - ) - print(f"\ninitially generated instructions: {generated_instructions_raw}\n") - - # do not evaluate old instructions again - generated_instructions = [] # the new instructions generated in this step - for ins in generated_instructions_raw: - ins_md5_hashstring = eval_utils.instruction_to_filename( - ins, md5_hashing=True - ) - if ins_md5_hashstring not in old_instruction_md5_hashstrings_set: - generated_instructions.append(ins) - old_instruction_md5_hashstrings_set.add(ins_md5_hashstring) - else: - print(f"already evaluated '{ins}' previously") - generated_instructions = list(set(generated_instructions)) - - to_evaluate_instructions = [] - for instruction in generated_instructions: - if len(instruction) > 500: - print(f"Step {i_step}, instruction: {instruction}, too long, skipped") - continue - if dataset_name == "gsm8k" and any( - char.isdigit() for char in instruction - ): - print( - f"Step {i_step}, instruction: {instruction}, contains numbers," - " skipped" - ) - continue - if "INS" in instruction: - print( - f"Step {i_step}, instruction: {instruction}, contains 'INS'," - " skipped" - ) - continue - to_evaluate_instructions.append(instruction) - print(f"\nto-evaluate generated instructions: {to_evaluate_instructions}\n") - - # evaluate new instructions on the few-shot exemplars in meta-prompt - if few_shot_qa_pairs and evaluate_generated_ins_on_few_shot: - print("evaluating GENERATED instructions on few-shot exemplars") - single_step_eval_on_few_shot = dict() - for instruction in to_evaluate_instructions: - if instruction not in prev_saved_instructions: - print( - f"evaluating Step {i_step}, instruction: {instruction} on" - " few-shot exemplars" - ) - detailed_results_df = eval_utils.evaluate_single_instruction( - data=raw_data, - instruction=instruction, - eval_index_all=few_shot_index_list, - batch_size=batch_size, - call_server_func=call_scorer_server_func, - dataset_name=dataset_name, - num_servers=num_servers, - extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again, - include_qa=include_qa, - evaluate_in_parallel=evaluate_in_parallel, - instruction_pos=instruction_pos, - is_multiple_choice=is_multiple_choice, - prediction_treat_as_number=prediction_treat_as_number, - prediction_treat_as_bool=prediction_treat_as_bool, - prediction_num_decimals=0, - max_retry=5, - sleep_time=180, - verbose=verbose, - ) - single_step_eval_on_few_shot[instruction] = detailed_results_df - - print( - f"Step {i_step}, single_step_eval_on_few_shot:" - f" {single_step_eval_on_few_shot}\n" - ) - generated_ins_on_few_shot_results_dict[i_step] = ( - single_step_eval_on_few_shot - ) - - # evaluate OLD instructions on the few-shot exemplars in meta-prompt - if few_shot_qa_pairs and evaluate_old_ins_on_few_shot: - print("evaluating OLD instructions on few-shot exemplars") - single_step_eval_on_few_shot = dict() - for instruction, _, _ in old_instructions_and_scores: - print( - f"evaluating Step {i_step}, instruction: {instruction} on few-shot" - " exemplars" - ) - detailed_results_df = eval_utils.evaluate_single_instruction( - data=raw_data, - instruction=instruction, - eval_index_all=few_shot_index_list, - batch_size=scorer_llm_dict["batch_size"], - call_server_func=call_scorer_server_func, - dataset_name=dataset_name, - num_servers=scorer_llm_dict["num_servers"], - extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again, - include_qa=include_qa, - evaluate_in_parallel=evaluate_in_parallel, - instruction_pos=instruction_pos, - is_multiple_choice=is_multiple_choice, - prediction_treat_as_number=prediction_treat_as_number, - prediction_treat_as_bool=prediction_treat_as_bool, - prediction_num_decimals=0, - max_retry=5, - sleep_time=180, - verbose=verbose, - ) - single_step_eval_on_few_shot[instruction] = detailed_results_df - - print( - f"Step {i_step}, single_step_eval_on_few_shot:" - f" {single_step_eval_on_few_shot}\n" - ) - old_ins_on_few_shot_results_dict[i_step] = single_step_eval_on_few_shot - - # evaluate newly generated instructions on the training set - for instruction in to_evaluate_instructions: - if instruction not in prev_saved_instructions: - print(f"""computing the score of "{instruction}" by prompting""") - detailed_results_df = eval_utils.evaluate_single_instruction( - data=raw_data, - instruction=instruction, - eval_index_all=train_index, - batch_size=batch_size, - call_server_func=call_scorer_server_func, - dataset_name=dataset_name, - num_servers=num_servers, - extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again, - include_qa=include_qa, - evaluate_in_parallel=evaluate_in_parallel, - instruction_pos=instruction_pos, - is_multiple_choice=is_multiple_choice, - prediction_treat_as_number=prediction_treat_as_number, - prediction_treat_as_bool=prediction_treat_as_bool, - prediction_num_decimals=0, - max_retry=5, - sleep_time=180, - verbose=verbose, - ) - prev_saved_instructions.add(instruction) - else: - # do not re-evaluate instructions that had been evaluated previously - detailed_results_df = pd.read_csv( - os.path.join(result_by_instruction_folder, f"{instruction}.csv"), - index_col=0, - header=0, - ) - print(f"""reading previously saved "{instruction}" information""") - - scores = detailed_results_df["accuracy"] - average_score = np.average(scores) - print( - f"Step {i_step}, instruction: {instruction}, score: {average_score}" - ) - - # increment the counter on wrong questions - wrong_question_indices_set = set( - list( - detailed_results_df[detailed_results_df["accuracy"] == 0.0].index - ) - ) - for idx in wrong_question_indices_set: - wrong_questions_from_start_counter[idx] += 1 - - filename = eval_utils.instruction_to_filename(instruction) - file_path = os.path.join( - result_by_instruction_folder, f"""{filename}.csv""" - ) - detailed_results_df.to_csv(file_path, index=True, header=True) - print(f"saving results to {file_path}") - - detailed_results_df_by_instruction_dict[instruction] = detailed_results_df - old_instructions_and_scores.append((instruction, average_score, i_step)) - instruction_score_dict[instruction] = average_score - - # record all generated instructions - for instruction in generated_instructions_raw: - if instruction in instruction_score_dict: - average_score = instruction_score_dict[instruction] - else: - average_score = np.nan - old_instructions_and_scores_raw.append( - (instruction, average_score, i_step) - ) - - # =============================== eval ==================================== - # every eval_interval steps, evaluate the instructions that were generated - # in the current step and were not skipped - if not i_step % eval_interval: - for instruction in generated_instructions_raw: - # if the instruction wasn't skipped in any step - if instruction in instruction_score_dict: - if instruction not in instruction_eval_score_dict: - detailed_results_df = eval_utils.evaluate_single_instruction( - data=raw_data, - instruction=instruction, - eval_index_all=eval_index, - batch_size=batch_size, - call_server_func=call_scorer_server_func, - dataset_name=dataset_name, - num_servers=num_servers, - extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again, - include_qa=include_qa, - evaluate_in_parallel=evaluate_in_parallel, - instruction_pos=instruction_pos, - is_multiple_choice=is_multiple_choice_eval, - prediction_treat_as_number=prediction_treat_as_number, - prediction_treat_as_bool=prediction_treat_as_bool, - prediction_num_decimals=0, - max_retry=5, - sleep_time=180, - verbose=verbose, - ) - eval_score = np.average(detailed_results_df["accuracy"]) - eval_detailed_results_df_dict[instruction] = detailed_results_df - instruction_eval_score_dict[instruction] = eval_score - else: - eval_score = instruction_eval_score_dict[instruction] - print( - f"EVAL: \nStep {i_step}, instruction: {instruction}, eval score:" - f" {eval_score:.2f}" - ) - eval_results.append((i_step, instruction, eval_score)) - - # ===================== save up-to-date results =========================== - results_dict = dict() - results_dict["meta_prompts"] = meta_prompts - results_dict["old_instructions_and_scores"] = list( - old_instructions_and_scores - ) - results_dict["old_instructions_and_scores_raw"] = list( - old_instructions_and_scores_raw - ) - results_dict["generated_ins_on_few_shot_results_dict"] = ( - generated_ins_on_few_shot_results_dict - ) - results_dict["old_ins_on_few_shot_results_dict"] = ( - old_ins_on_few_shot_results_dict - ) - results_dict["few_shot_index_list_by_step_dict"] = ( - few_shot_index_list_by_step_dict - ) - results_dict["eval_results"] = eval_results - results_dict["eval_detailed_results_df_dict"] = ( - eval_detailed_results_df_dict - ) - with open(os.path.join(save_folder, "results_dict.pkl"), "wb") as fp: - pickle.dump(results_dict, fp) - print(f"\nsaved all results to\n{save_folder}") diff --git a/optimization/optimize_instructions.py b/optimization/optimize_instructions.py deleted file mode 100644 index e619ffb..0000000 --- a/optimization/optimize_instructions.py +++ /dev/null @@ -1,736 +0,0 @@ - -import datetime -import functools -import os -import sys - -OPRO_ROOT_PATH = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -) -sys.path.insert(0, OPRO_ROOT_PATH) - -from absl import app -from absl import flags -import google.generativeai as palm -import numpy as np -import openai -from opro import prompt_utils -from opro.optimization import opt_utils -import pandas as pd -FLAGS = flags.FLAGS -ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data") - -flags.DEFINE_string("local_model_path", "", "Path to local vLLM model.") - -_OPENAI_API_KEY = flags.DEFINE_string( - "openai_api_key", "", "The OpenAI API key." -) - -_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.") - -_SCORER = flags.DEFINE_string( - "scorer", "text-bison", "The name of the scorer LLM." -) - -_OPTIMIZER = flags.DEFINE_string( - "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM." -) - -_DATASET = flags.DEFINE_string( - "dataset", "gsm8k", "The name of dataset to search for instructions on." -) - -_TASK = flags.DEFINE_string( - "task", - "train", - "The name of task within the above dataset to search for instructions on.", -) - -_INSTRUCTION_POS = flags.DEFINE_string( - "instruction_pos", - "A_begin", - "The position of the instruction to search for.", -) - -_META_PROMPT_TYPE = flags.DEFINE_string( - "meta_prompt_type", - "both_instructions_and_exemplars", - "The type of meta-prompt: whether to have both previous instructions and" - " dataset exemplars (often for fine-tuned optimizers), or to have only" - " previous instructions (often for pre-trained optimizers).", -) - - -def main(_): - local_model_path = FLAGS.local_model_path - openai_api_key = _OPENAI_API_KEY.value - palm_api_key = _PALM_API_KEY.value - scorer_llm_name = _SCORER.value - optimizer_llm_name = _OPTIMIZER.value - dataset_name = _DATASET.value.lower() - task_name = _TASK.value - meta_prompt_type = _META_PROMPT_TYPE.value - - assert dataset_name in { - "mmlu", - "bbh", - "gsm8k", - }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k." - if dataset_name == "mmlu": - assert task_name in { - "STEM", - "humanities", - "social sciences", - "other (business, health, misc.)", - } # for now only support searching on one MMLU category - elif dataset_name == "bbh": - assert task_name in { - "boolean_expressions", - "causal_judgement", - "date_understanding", - "disambiguation_qa", - "dyck_languages", - "formal_fallacies", - "geometric_shapes", - "hyperbaton", - "logical_deduction_five_objects", - "logical_deduction_seven_objects", - "logical_deduction_three_objects", - "movie_recommendation", - "multistep_arithmetic_two", - "navigate", - "object_counting", - "penguins_in_a_table", - "reasoning_about_colored_objects", - "ruin_names", - "salient_translation_error_detection", - "snarks", - "sports_understanding", - "temporal_sequences", - "tracking_shuffled_objects_five_objects", - "tracking_shuffled_objects_seven_objects", - "tracking_shuffled_objects_three_objects", - "web_of_lies", - "word_sorting", - } - else: - assert dataset_name == "gsm8k" - assert task_name in {"train", "test"} - - assert scorer_llm_name in { - "text-bison", - "gpt-3.5-turbo", - "gpt-4", - "local", - } - assert optimizer_llm_name in { - "text-bison", - "gpt-3.5-turbo", - "gpt-4", - "local", - } - assert meta_prompt_type in { - "both_instructions_and_exemplars", - "instructions_only", - } - - instruction_pos = _INSTRUCTION_POS.value - assert instruction_pos in { - "before_Q", - "Q_begin", - "Q_end", - "A_begin", - }, ( - "The instruction position should be either before the question, or at the" - " beginning of the question, at the end of the question, or at the" - " beginning of the answer." - ) - print( - f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:" - f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}" - ) - - - if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - assert openai_api_key, "The OpenAI API key must be provided." - openai.api_key = openai_api_key - elif scorer_llm_name == "text-bison": - assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model." - palm.configure(api_key=palm_api_key) - elif scorer_llm_name == "local": - # 本地模型,无需 API key - pass - else: - raise ValueError(f"Unknown scorer model: {scorer_llm_name}") - - if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - assert openai_api_key, "The OpenAI API key must be provided." - openai.api_key = openai_api_key - elif optimizer_llm_name == "text-bison": - assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model." - palm.configure(api_key=palm_api_key) - elif optimizer_llm_name == "local": - # 本地模型,无需 API key - pass - else: - raise ValueError(f"Unknown scorer model: {optimizer_llm_name}") - - - if dataset_name == "mmlu": - root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data") - elif dataset_name == "bbh": - root_data_folder_path = os.path.join( - ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/" - ) - else: - assert dataset_name == "gsm8k" - root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data") - - # =================== create the result directory ========================== - datetime_str = ( - str(datetime.datetime.now().replace(microsecond=0)) - .replace(" ", "-") - .replace(":", "-") - ) - - - save_folder = os.path.join( - OPRO_ROOT_PATH, - "outputs", - "optimization-results", - f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/", -) - - - result_by_instruction_folder = os.path.join( - save_folder, "result_by_instruction" - ) - print(f"Results will be saved to: {os.path.abspath(result_by_instruction_folder)}") - os.makedirs(result_by_instruction_folder,exist_ok=True) - print(f"result directory:\n{save_folder}") - - # ====================== scorer model configs ============================== - - - if scorer_llm_name == "text-bison": - # when prompting text-bison with Cloud API - scorer_finetuned_palm_temperature = 0.0 - scorer_finetuned_palm_max_decode_steps = 1024 - scorer_finetuned_palm_batch_size = 1 - scorer_finetuned_palm_num_servers = 1 - scorer_finetuned_palm_dict = dict() - scorer_finetuned_palm_dict["temperature"] = ( - scorer_finetuned_palm_temperature - ) - scorer_finetuned_palm_dict["num_servers"] = ( - scorer_finetuned_palm_num_servers - ) - scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size - scorer_finetuned_palm_dict["max_decode_steps"] = ( - scorer_finetuned_palm_max_decode_steps - ) - - call_scorer_finetuned_palm_server_func = functools.partial( - prompt_utils.call_palm_server_from_cloud, - model="text-bison-001", - temperature=scorer_finetuned_palm_dict["temperature"], - max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"], - ) - - scorer_llm_dict = { - "model_type": scorer_llm_name.lower(), - } - scorer_llm_dict.update(scorer_finetuned_palm_dict) - call_scorer_server_func = call_scorer_finetuned_palm_server_func - - - elif scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - # 改成调用本地vLLM版本的函数 - scorer_gpt_max_decode_steps = 1024 - # scorer_gpt_max_decode_steps = 512 - scorer_gpt_temperature = 0.0 - - scorer_llm_dict = { - "model_type": scorer_llm_name.lower(), - "max_decode_steps": scorer_gpt_max_decode_steps, - "temperature": scorer_gpt_temperature, - "num_decodes": 1, - "batch_size": 1, - "num_servers": 1, - } - - call_scorer_server_func = functools.partial( - prompt_utils.call_openai_server_func, # 你本地实现的vLLM调用函数 - max_decode_steps=scorer_gpt_max_decode_steps, - temperature=scorer_gpt_temperature, - local_model_path=FLAGS.local_model_path, # 传入你本地模型路径 - ) - - else: - raise ValueError(f"Unsupported scorer_llm_name: {scorer_llm_name}") - - - # ====================== optimizer model configs ============================ - if optimizer_llm_name.lower() == "text-bison": - # when prompting text-bison with Cloud API - optimizer_finetuned_palm_temperature = 1.0 - optimizer_finetuned_palm_num_decodes = 8 - optimizer_finetuned_palm_max_decode_steps = 1024 - optimizer_finetuned_palm_batch_size = 1 - optimizer_finetuned_palm_num_servers = 1 - optimizer_finetuned_palm_dict = dict() - optimizer_finetuned_palm_dict["temperature"] = ( - optimizer_finetuned_palm_temperature - ) - optimizer_finetuned_palm_dict["num_decodes"] = ( - optimizer_finetuned_palm_num_decodes - ) - optimizer_finetuned_palm_dict["batch_size"] = ( - optimizer_finetuned_palm_batch_size - ) - optimizer_finetuned_palm_dict["num_servers"] = ( - optimizer_finetuned_palm_num_servers - ) - optimizer_finetuned_palm_dict["max_decode_steps"] = ( - optimizer_finetuned_palm_max_decode_steps - ) - - call_optimizer_finetuned_palm_server_func = functools.partial( - prompt_utils.call_palm_server_from_cloud, - model="text-bison-001", - temperature=optimizer_finetuned_palm_dict["temperature"], - max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"], - ) - - optimizer_llm_dict = { - "model_type": optimizer_llm_name.lower(), - } - optimizer_llm_dict.update(optimizer_finetuned_palm_dict) - call_optimizer_server_func = call_optimizer_finetuned_palm_server_func - - elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}: - # 用本地 vLLM 版本替代调用 - optimizer_gpt_max_decode_steps = 512 - - optimizer_gpt_temperature = 1.0 - - optimizer_llm_dict = { - "max_decode_steps": optimizer_gpt_max_decode_steps, - "temperature": optimizer_gpt_temperature, - "batch_size": 1, - "num_decodes": 1, - } - - call_optimizer_server_func = functools.partial( - prompt_utils.call_openai_server_func, # 你写的本地vLLM调用接口 - max_decode_steps=optimizer_gpt_max_decode_steps, - temperature=optimizer_gpt_temperature, - local_model_path=FLAGS.local_model_path, - ) - - else: - raise ValueError(f"Unsupported optimizer_llm_name: {optimizer_llm_name}") - - - - # ====================== try calling the servers ============================ - print("\n======== testing the scorer and optimizer servers ===========") - scorer_test_output = call_scorer_server_func( - "Does the sun rise from the north? Just answer yes or no." - ) - print(f"number of scorer output decodes: {len(scorer_test_output)}") - print(f"scorer test output: {scorer_test_output}") - optimizer_test_output = call_optimizer_server_func( - "Does the sun rise from the north? Just answer yes or no.", - temperature=1.0, - ) - print(f"number of optimizer output decodes: {len(optimizer_test_output)}") - print(f"optimizer test output: {optimizer_test_output}") - print("Finished testing the servers.") - - # ====================== read data ============================ - print("\n================ prompt optimization settings ==============") - # from https://github.com/hendrycks/test/blob/master/categories.py - subcategories = { - "abstract_algebra": ["math"], - "anatomy": ["health"], - "astronomy": ["physics"], - "business_ethics": ["business"], - "clinical_knowledge": ["health"], - "college_biology": ["biology"], - "college_chemistry": ["chemistry"], - "college_computer_science": ["computer science"], - "college_mathematics": ["math"], - "college_medicine": ["health"], - "college_physics": ["physics"], - "computer_security": ["computer science"], - "conceptual_physics": ["physics"], - "econometrics": ["economics"], - "electrical_engineering": ["engineering"], - "elementary_mathematics": ["math"], - "formal_logic": ["philosophy"], - "global_facts": ["other"], - "high_school_biology": ["biology"], - "high_school_chemistry": ["chemistry"], - "high_school_computer_science": ["computer science"], - "high_school_european_history": ["history"], - "high_school_geography": ["geography"], - "high_school_government_and_politics": ["politics"], - "high_school_macroeconomics": ["economics"], - "high_school_mathematics": ["math"], - "high_school_microeconomics": ["economics"], - "high_school_physics": ["physics"], - "high_school_psychology": ["psychology"], - "high_school_statistics": ["math"], - "high_school_us_history": ["history"], - "high_school_world_history": ["history"], - "human_aging": ["health"], - "human_sexuality": ["culture"], - "international_law": ["law"], - "jurisprudence": ["law"], - "logical_fallacies": ["philosophy"], - "machine_learning": ["computer science"], - "management": ["business"], - "marketing": ["business"], - "medical_genetics": ["health"], - "miscellaneous": ["other"], - "moral_disputes": ["philosophy"], - "moral_scenarios": ["philosophy"], - "nutrition": ["health"], - "philosophy": ["philosophy"], - "prehistory": ["history"], - "professional_accounting": ["other"], - "professional_law": ["law"], - "professional_medicine": ["health"], - "professional_psychology": ["psychology"], - "public_relations": ["politics"], - "security_studies": ["politics"], - "sociology": ["culture"], - "us_foreign_policy": ["politics"], - "virology": ["health"], - "world_religions": ["philosophy"], - } - - categories = { - "STEM": [ - "physics", - "chemistry", - "biology", - "computer science", - "math", - "engineering", - ], - "humanities": ["history", "philosophy", "law"], - "social sciences": [ - "politics", - "culture", - "economics", - "geography", - "psychology", - ], - "other (business, health, misc.)": ["other", "business", "health"], - } - - if dataset_name == "mmlu": - - category_names = [task_name] - folder_name = "test" # one of {'auxiliary_train', 'dev', 'val', 'test'} - task_names = [] - for task_csv_name in os.listdir( - os.path.join(root_data_folder_path, folder_name) - ): - task_names.append(task_csv_name.split(".")[0]) - - tasks_in_category = [] - for category_name in category_names: - for task_name in task_names: - for subname in subcategories: - if subname in task_name: - if subcategories[subname][0] in categories[category_name]: - tasks_in_category.append(task_name) - break - - tasks_all = [(folder_name, task_name) for task_name in tasks_in_category] - multiple_choice_tasks = set([item[1] for item in tasks_all]) - boolean_tasks = set() - numerical_output_tasks = set() - - - elif dataset_name == "bbh": - tasks_all = [task_name] - assert ( - len(tasks_all) == 1 - ), "for now only support prompt optimization on one BBH task" - - - numerical_output_tasks = { - "object_counting", - "multistep_arithmetic_two", - } - - multiple_choice_tasks = { - "date_understanding", - "disambiguation_qa", - "geometric_shapes", - "hyperbaton", - "logical_deduction_five_objects", - "logical_deduction_seven_objects", - "logical_deduction_three_objects", - "movie_recommendation", - "penguins_in_a_table", - "reasoning_about_colored_objects", - "ruin_names", - "salient_translation_error_detection", - "snarks", - "temporal_sequences", - "tracking_shuffled_objects_five_objects", - "tracking_shuffled_objects_seven_objects", - "tracking_shuffled_objects_three_objects", - } - - boolean_tasks = { - "boolean_expressions", # True or False - "causal_judgement", # yes or no - "formal_fallacies", # valid or invalid - "navigate", # yes or no - "sports_understanding", # yes or no - "web_of_lies", # yes or no - } - - else: - assert dataset_name in {"gsm8k"} - tasks_all = [task_name] - multiple_choice_tasks = set() - boolean_tasks = set() - numerical_output_tasks = set(tasks_all) - - if dataset_name == "mmlu": - raw_data = pd.DataFrame() - prediction_treat_as_number = False - prediction_treat_as_bool = False - elif dataset_name == "bbh": - raw_data = [] - prediction_treat_as_number = bool( - tasks_all[0] in numerical_output_tasks - ) # for now only check the first task - prediction_treat_as_bool = bool( - tasks_all[0] in boolean_tasks - ) # for now only check the first task - print( - f"prediction_treat_as_number: {prediction_treat_as_number}," - f" prediction_treat_as_bool: {prediction_treat_as_bool}" - ) - else: - assert dataset_name == "gsm8k" - raw_data = pd.DataFrame() - prediction_treat_as_number = True - prediction_treat_as_bool = False - - for t in tasks_all: - if dataset_name == "mmlu": - folder_name = t[0] - task_name = t[1] - single_task_df = pd.read_csv( - os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"), - index_col=None, - header=None, - ) - raw_data = pd.concat([raw_data, single_task_df]) - elif dataset_name == "bbh": - task_name = t - single_task_list = opt_utils.load_bbh_task_data( - task_name, base_dir=root_data_folder_path - ) - raw_data += single_task_list - else: - assert dataset_name == "gsm8k" - task_name = t - f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv") - single_task_df = pd.read_csv(f_gsm, sep="\t", header=None) - raw_data = pd.concat([raw_data, single_task_df]) - - if dataset_name == "mmlu": - num_examples = raw_data.shape[0] - elif dataset_name == "bbh": - num_examples = len(raw_data) - else: - assert dataset_name in {"gsm8k"} - num_examples = raw_data.shape[0] - print(f"number of examples in the current task: {num_examples}") - - # ================ split data into train/val/test ========================== - if dataset_name == "mmlu": - train_ratio = 0.8 - - eval_ratio = 0.2 - elif dataset_name == "gsm8k": -# train_ratio = 0.035 - train_ratio = 0.01 # 原来是 0.035,改成 0.01,约 74 条 - eval_ratio = 0 - else: - assert dataset_name == "bbh" - train_ratio = 0.2 - eval_ratio = 0 - - - assert train_ratio + eval_ratio <= 1 - test_ratio = 1 - train_ratio - eval_ratio - print( - f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, " - f"test_ratio: {test_ratio}" - ) - np.random.seed(0) - train_index = np.sort( - np.array( - np.random.choice( - num_examples, size=int(train_ratio * num_examples), replace=False - ) - ) - ) - eval_and_test_index = np.sort( - np.array(list(set(np.arange(num_examples)) - set(train_index))) - ) - eval_index = np.sort( - np.array( - np.random.choice( - eval_and_test_index, - size=int(eval_ratio * num_examples), - replace=False, - ) - ) - ) - - # ========== set other optimization experiment hyperparameters ============== - if scorer_llm_name == "text-bison": - old_instruction_score_threshold = 0.0 - # old_instruction_score_threshold = 0.15 # for GSM8K - else: - assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"} - old_instruction_score_threshold = 0.3 - - if scorer_llm_name == "text-bison": - extract_final_answer_by_prompting_again = False - include_qa = False - evaluate_in_parallel = False - else: - assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"} - extract_final_answer_by_prompting_again = False - include_qa = False - evaluate_in_parallel = False - - optimizer_llm_temperature = optimizer_llm_dict["temperature"] - - -# num_few_shot_questions_for_instruction_refinement = 3 - num_few_shot_questions_for_instruction_refinement = 1 # 减少 few-shot 例子数 - -# num_generated_instructions_in_each_step = 8 - num_generated_instructions_in_each_step = 2 # 每步只生成 2 条指令 - -# num_search_steps = 200 - num_search_steps = 3 # 原来是 200,改成 3 步即可 - - - initial_instructions = [ - "Let's solve the problem.", - # "", - # "The answer is", - ] - few_shot_qa_pairs = True - # one of {'accumulative_most_frequent', 'current_most_frequent', 'random', - # 'constant'} - few_shot_selection_criteria = "random" - # whether to evaluate generated instructions on the exemplars in meta-prompt - evaluate_generated_ins_on_few_shot = False - # whether to evaluate old instructions on the exemplars in the meta-prompt - evaluate_old_ins_on_few_shot = False - # every this number of steps, compute the accuracies of current-step - # instructions on the validation set -# eval_interval = 3 - eval_interval = 1 # 每步就 eval 一次,及时看到结果 - # eval_interval = 10 - max_num_instructions = ( - 20 # the maximum number of instructions and scores in the meta-prompt - ) - # The number of buckets when converting scores to integers in the meta-prompt. - num_score_buckets = 100 - # whether to put old instructions and scores to before exemplars in - # the meta-prompt - meta_prompt_instructions_before_exemplars = True - - # ===================== run prompt optimization ====================== - - assert few_shot_selection_criteria in { - "accumulative_most_frequent", - "current_most_frequent", - "random", - "constant", - } - evolution_kwargs = { - "num_search_steps": num_search_steps, - "old_instruction_score_threshold": old_instruction_score_threshold, - "scorer_llm_dict": scorer_llm_dict, - "optimizer_llm_dict": optimizer_llm_dict, - "extract_final_answer_by_prompting_again": ( - extract_final_answer_by_prompting_again - ), - "include_qa": include_qa, - "evaluate_in_parallel": evaluate_in_parallel, - "tasks_all": tasks_all, - "train_ratio": train_ratio, - "eval_ratio": eval_ratio, - "test_ratio": test_ratio, - "train_index": train_index, - "eval_index": eval_index, - "dataset_name": dataset_name, - "task_name": task_name, - "num_examples": num_examples, - "root_data_folder_path": root_data_folder_path, - "optimizer_llm_temperature": optimizer_llm_temperature, - # "optimizer_llm_temperature_schedule": ( - # optimizer_llm_temperature_schedule - # ), - # "optimizer_llm_temperature_end": optimizer_llm_temperature_end, - "initial_instructions": initial_instructions, - "multiple_choice_tasks": multiple_choice_tasks, - "raw_data": raw_data, - "call_scorer_server_func": call_scorer_server_func, - "call_optimizer_server_func": call_optimizer_server_func, - "instruction_pos": instruction_pos, - "prediction_treat_as_number": prediction_treat_as_number, - "prediction_treat_as_bool": prediction_treat_as_bool, - "result_by_instruction_folder": result_by_instruction_folder, - "few_shot_qa_pairs": few_shot_qa_pairs, - "num_score_buckets": num_score_buckets, - "max_num_instructions": max_num_instructions, - "meta_prompt_type": meta_prompt_type, - "meta_prompt_instructions_before_exemplars": ( - meta_prompt_instructions_before_exemplars - ), - "few_shot_selection_criteria": few_shot_selection_criteria, - "optimizer_llm_name": optimizer_llm_name, - "num_generated_instructions_in_each_step": ( - num_generated_instructions_in_each_step - ), - "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot, - "num_few_shot_questions_for_instruction_refinement": ( - num_few_shot_questions_for_instruction_refinement - ), - "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot, - "eval_interval": eval_interval, - "save_folder": save_folder, - } - print("=== 开始优化过程 ===") - try: - opt_utils.run_evolution(**evolution_kwargs) - print("=== 优化完成 ===") - except Exception as e: - import traceback - print(f"!!! 优化失败: {e} !!!", file=sys.stderr) - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - app.run(main) - diff --git a/optimization/optimize_linear_regression.py b/optimization/optimize_linear_regression.py deleted file mode 100644 index f856434..0000000 --- a/optimization/optimize_linear_regression.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright 2023 The OPRO Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -r"""Optimize over the objective function of a linear regression problem. - -Usage: - -``` -python optimize_linear_regression.py --optimizer="text-bison" -``` - -Note: -- When using a Google-Cloud-served model (like text-bison at -https://developers.generativeai.google/tutorials/text_quickstart), add -`--palm_api_key=""` -- When using an OpenAI model, add `--openai_api_key=""` -""" - -import datetime -import functools -import json -import os -import re -import sys - -OPRO_ROOT_PATH = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -) -sys.path.insert(0, OPRO_ROOT_PATH) - -from absl import app -from absl import flags -import google.generativeai as palm -import numpy as np -import openai - -from opro import prompt_utils - -_OPENAI_API_KEY = flags.DEFINE_string( - "openai_api_key", "", "The OpenAI API key." -) - -_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.") - -_OPTIMIZER = flags.DEFINE_string( - "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM." -) - - -def main(_): - # ============== set optimization experiment configurations ================ - num_points = 50 # number of points in linear regression - w_true = 15 # the true w - b_true = 14 # the true b - max_num_steps = 500 # the number of optimization steps - num_reps = 5 # the number of repeated runs - max_num_pairs = 20 # the maximum number of input-output pairs in meta-prompt - num_input_decimals = 0 # num of decimals for input values in meta-prompt - num_output_decimals = 0 # num of decimals for output values in meta-prompt - num_generated_points_in_each_step = 8 - - # ================ load LLM settings =================== - optimizer_llm_name = _OPTIMIZER.value - assert optimizer_llm_name in { - "text-bison", - "gpt-3.5-turbo", - "gpt-4", - } - openai_api_key = _OPENAI_API_KEY.value - palm_api_key = _PALM_API_KEY.value - - if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - assert openai_api_key, "The OpenAI API key must be provided." - openai.api_key = openai_api_key - else: - assert optimizer_llm_name == "text-bison" - assert ( - palm_api_key - ), "A PaLM API key is needed when prompting the text-bison model." - palm.configure(api_key=palm_api_key) - - # =================== create the result directory ========================== - datetime_str = ( - str(datetime.datetime.now().replace(microsecond=0)) - .replace(" ", "-") - .replace(":", "-") - ) - - save_folder = os.path.join( - OPRO_ROOT_PATH, - "outputs", - "optimization-results", - f"linear_regression-o-{optimizer_llm_name}-{datetime_str}/", - ) - os.makedirs(save_folder) - print(f"result directory:\n{save_folder}") - - # ====================== optimizer model configs ============================ - if optimizer_llm_name.lower() == "text-bison": - # when prompting text-bison with Cloud API - optimizer_finetuned_palm_temperature = 1.0 - optimizer_finetuned_palm_max_decode_steps = 1024 - optimizer_finetuned_palm_batch_size = 1 - optimizer_finetuned_palm_num_servers = 1 - optimizer_finetuned_palm_dict = dict() - optimizer_finetuned_palm_dict["temperature"] = ( - optimizer_finetuned_palm_temperature - ) - optimizer_finetuned_palm_dict["batch_size"] = ( - optimizer_finetuned_palm_batch_size - ) - optimizer_finetuned_palm_dict["num_servers"] = ( - optimizer_finetuned_palm_num_servers - ) - optimizer_finetuned_palm_dict["max_decode_steps"] = ( - optimizer_finetuned_palm_max_decode_steps - ) - - call_optimizer_finetuned_palm_server_func = functools.partial( - prompt_utils.call_palm_server_from_cloud, - # prompt_utils.call_vllm, - model="text-bison-001", - temperature=optimizer_finetuned_palm_dict["temperature"], - max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"], - ) - - optimizer_llm_dict = { - "model_type": optimizer_llm_name.lower(), - } - optimizer_llm_dict.update(optimizer_finetuned_palm_dict) - call_optimizer_server_func = call_optimizer_finetuned_palm_server_func - - else: - assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"} - optimizer_gpt_max_decode_steps = 1024 - optimizer_gpt_temperature = 1.0 - - optimizer_llm_dict = dict() - optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps - optimizer_llm_dict["temperature"] = optimizer_gpt_temperature - optimizer_llm_dict["batch_size"] = 1 - call_optimizer_server_func = functools.partial( - prompt_utils.call_openai_server_func, - model=optimizer_llm_name, - max_decode_steps=optimizer_gpt_max_decode_steps, - temperature=optimizer_gpt_temperature, - ) - - # ====================== try calling the servers ============================ - print("\n======== testing the optimizer server ===========") - optimizer_test_output = call_optimizer_server_func( - "Does the sun rise from the north? Just answer yes or no.", - temperature=1.0, - ) - print(f"optimizer test output: {optimizer_test_output}") - print("Finished testing the optimizer server.") - print("\n=================================================") - - # ====================== utility functions ============================ - def evaluate_loss(X, y, w, b): # pylint: disable=invalid-name - residual = y - (X * w + b) - return np.linalg.norm(residual) ** 2 - - def gen_meta_prompt( - old_value_pairs_set, - X, # pylint: disable=invalid-name, unused-argument - y, # pylint: disable=unused-argument - num_input_decimals=5, - num_output_decimals=5, - max_num_pairs=100, - ): - """Generate the meta-prompt for optimization. - - Args: - old_value_pairs_set (set): the set of old (w, b, z) pairs. - X (np.array): the 1D array of x values. - y (np.array): the 1D array of y values. - num_input_decimals (int): the number of decimals for (w, b) in the - meta-prompt. - num_output_decimals (int): the number of decimals for z in the meta-prompt. - max_num_pairs (int): the maximum number of exemplars in the meta-prompt. - - Returns: - meta_prompt (str): the generated meta-prompt. - """ - old_value_pairs_set = set( - [ # pylint: disable=g-complex-comprehension - ( - np.round(w, num_input_decimals) - if num_input_decimals > 0 - else int(w), - np.round(b, num_input_decimals) - if num_input_decimals > 0 - else int(b), - np.round(z, num_output_decimals) - if num_output_decimals > 0 - else int(z), - ) - for w, b, z in old_value_pairs_set - ] - ) - old_value_pairs = list(old_value_pairs_set) - old_value_pairs = sorted(old_value_pairs, key=lambda x: -x[2])[ - -max_num_pairs: - ] - old_value_pairs_substr = "" - for w, b, z in old_value_pairs: - old_value_pairs_substr += f"\ninput:\nw={w}, b={b}\nvalue:\n{z}\n" - meta_prompt = """ - Now you will help me minimize a function with two input variables w, b. I have some (w, b) pairs and the function values at those points. The pairs are arranged in descending order based on their function values, where lower values are better. - """.strip() - meta_prompt += "\n\n" - meta_prompt += old_value_pairs_substr.strip() - meta_prompt += "\n\n" - # function_analytic_form = "" - # for xi, yi in zip(X, y): - # function_analytic_form += f"({yi:.4f} - ({xi:.4f} * w + b)) ** 2 + " - # function_analytic_form = function_analytic_form[:-3] - # meta_prompt += ( - # "The function has the analytic form f(w, b) =" - # f" {function_analytic_form}. When evaluating the value of a (w, b)" - # " pair, you should replace the w and b in the analytic form with your" - # " values and do the computation." - # ) - # meta_prompt += "\n\n" - meta_prompt += """Give me a new (w, b) pair that is different from all pairs above, and has a function value lower than any of the above. Do not write code. The output must end with a pair [w, b], where w and b are numerical values. - """.strip() - return meta_prompt - - def extract_string_in_square_brackets(input_string): - raw_result = re.findall(r"\[.*?\]", input_string) - if raw_result: - for pair in raw_result[::-1]: - if "=" not in pair and ("w" in pair or "b" in pair): - continue - return pair[1:-1] - return "" - else: - return "" - - def parse_output(extracted_output): - """Parse the extracted output 'w, b' string to np.array([w, b]). - - Args: - extracted_output (str): the extracted output string, like '1.5, 2.5'. - - Returns: - parsed_output (np.array): the parsed output in a numpy array, like [1.5, - 2.5]. - """ - if not extracted_output: - return - extracted_values = [] - for item in extracted_output.split(","): - if "=" in item: - item = item[item.index("=") + 1 :] - extracted_values.append(item.strip()) - parsed_output = np.array(extracted_values).astype(float) - return parsed_output - - configs_dict = dict() - results_dict = dict() - num_convergence_steps = [] - for i_rep in range(num_reps): - found_optimal = False - print(f"\nRep {i_rep}:") - - # ================= generate the ground truth X, y ===================== - X = np.arange(num_points).astype(float) + 1 # pylint: disable=invalid-name - np.random.seed(i_rep + 1) - y = X * w_true + b_true + np.random.randn(num_points) - loss_at_true_values = evaluate_loss(X, y, w_true, b_true) - print(f"value at (w_true, b_true): {loss_at_true_values}") - - # ================= generate the starting points ===================== - num_starting_points = 5 # the number of initial points for optimization - np.random.seed((i_rep + 1) * 10) - init_w = np.random.uniform(low=10, high=20, size=num_starting_points) - np.random.seed((i_rep + 1) * 100) - init_b = np.random.uniform(low=10, high=20, size=num_starting_points) - - # ====================== run optimization ============================ - configs_dict_single_rep = { - "optimizer_llm_configs": optimizer_llm_dict, - "data": { - "num_points": num_points, - "w_true": w_true, - "b_true": b_true, - "loss_at_true_values": loss_at_true_values, - "X": list(X), - "y": list(y), - }, - "init_w": list(init_w), - "init_b": list(init_b), - "max_num_steps": max_num_steps, - "max_num_pairs": max_num_pairs, - "num_input_decimals": num_input_decimals, - "num_output_decimals": num_output_decimals, - "num_generated_points_in_each_step": num_generated_points_in_each_step, - } - configs_dict[i_rep] = configs_dict_single_rep - configs_json_path = os.path.join(save_folder, "configs.json") - print(f"saving configs to\n{configs_json_path}") - with open(configs_json_path, "w") as f: - json.dump(configs_dict, f, indent=4) - - old_value_pairs_set = set() - old_value_pairs_with_i_step = [] # format: [(w, b, z = f(w, b), i_step)] - meta_prompts_dict = dict() # format: {i_step: meta_prompt} - raw_outputs_dict = dict() # format: {i_step: raw_outputs} - - rounded_inits = [ - (np.round(w, num_input_decimals), np.round(b, num_input_decimals)) - for w, b in zip(init_w, init_b) - ] - rounded_inits = [ - tuple(item) for item in list(np.unique(rounded_inits, axis=0)) - ] - for w, b in rounded_inits: - z = evaluate_loss(X, y, w, b) - old_value_pairs_set.add((w, b, z)) - old_value_pairs_with_i_step.append((w, b, z, -1)) - - print("\n================ run optimization ==============") - print( - f"initial points: {[tuple(item[:2]) for item in old_value_pairs_set]}" - ) - print(f"initial values: {[item[-1] for item in old_value_pairs_set]}") - results_json_path = os.path.join(save_folder, "results.json") - print(f"saving results to\n{results_json_path}") - - for i_step in range(max_num_steps): - print(f"\nStep {i_step}:") - meta_prompt = gen_meta_prompt( - old_value_pairs_set, - X, - y, - num_input_decimals=num_input_decimals, - num_output_decimals=num_output_decimals, - max_num_pairs=max_num_pairs, - ) - if not i_step % 5: - print("\n=================================================") - print(f"meta_prompt:\n{meta_prompt}") - meta_prompts_dict[i_step] = meta_prompt - - # generate a maximum of the given number of points in each step - remaining_num_points_to_generate = num_generated_points_in_each_step - raw_outputs = [] - while remaining_num_points_to_generate > 0: - raw_outputs += call_optimizer_server_func(meta_prompt) - remaining_num_points_to_generate -= optimizer_llm_dict["batch_size"] - raw_outputs = raw_outputs[:num_generated_points_in_each_step] - - raw_outputs_dict[i_step] = raw_outputs - parsed_outputs = [] - for string in raw_outputs: - if not i_step % 5: - print("\n=================================================") - print("raw output:\n", string) - print("\n=================================================") - try: - parsed_output = parse_output( - extract_string_in_square_brackets(string) - ) - if parsed_output is not None and len(parsed_output) == 2: - parsed_outputs.append(parsed_output) - except ValueError: - pass - parsed_outputs = [tuple(item) for item in parsed_outputs] - print(f"proposed points before rounding: {parsed_outputs}") - - # round the proposed points to the number of decimals in meta-prompt - rounded_outputs = [ - (np.round(w, num_input_decimals), np.round(b, num_input_decimals)) - for w, b in parsed_outputs - ] - rounded_outputs = [ - tuple(item) for item in list(np.unique(rounded_outputs, axis=0)) - ] - print(f"proposed points after rounding: {rounded_outputs}") - - # evaluate the values of proposed and rounded outputs - single_step_values = [] - for w, b in rounded_outputs: - if w == w_true and b == b_true: - found_optimal = True - z = evaluate_loss(X, y, w, b) - single_step_values.append(z) - old_value_pairs_set.add((w, b, z)) - old_value_pairs_with_i_step.append((w, b, z, i_step)) - print(f"single_step_values: {single_step_values}") - - # ====================== save results ============================ - results_dict_single_rep = { - "meta_prompts": meta_prompts_dict, - "raw_outputs": raw_outputs_dict, - "old_value_pairs_with_i_step": old_value_pairs_with_i_step, - } - results_dict[i_rep] = results_dict_single_rep - with open(results_json_path, "w") as f: - json.dump(results_dict, f, indent=4) - if found_optimal: - print( - f"Repetition {i_rep+1}, optimal found at Step {i_step+1}, saving" - f" final results to\n{save_folder}" - ) - num_convergence_steps.append(i_step + 1) - break - print(f"num_convergence_steps: {num_convergence_steps}") - - -if __name__ == "__main__": - app.run(main) diff --git a/optimization/optimize_tsp.py b/optimization/optimize_tsp.py deleted file mode 100644 index a8f99d9..0000000 --- a/optimization/optimize_tsp.py +++ /dev/null @@ -1,430 +0,0 @@ -# Copyright 2024 The OPRO Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -r"""Optimize over the objective function of a traveling salesman problem. - -Usage: - -``` -python optimize_tsp.py --optimizer="text-bison" -``` - -Note: -- When using a Google-Cloud-served model (like text-bison at -https://developers.generativeai.google/tutorials/text_quickstart), add -`--palm_api_key=""` -- When using an OpenAI model, add `--openai_api_key=""` -""" - -import datetime -import functools -import getpass -import json -import os -import re -import sys -import itertools - -OPRO_ROOT_PATH = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -) -sys.path.insert(0, OPRO_ROOT_PATH) - -from absl import app -from absl import flags -import google.generativeai as palm -import numpy as np -import openai - -from opro import prompt_utils - - -_OPENAI_API_KEY = flags.DEFINE_string( - "openai_api_key", "", "The OpenAI API key." -) - -_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.") - -_OPTIMIZER = flags.DEFINE_string( - "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM." -) - -_START_ALGORITHM = flags.DEFINE_string( - "starting_algorithm", "farthest_insertion", "The name of the starting algorithm. Select from [dp, nearest_neighbor, farthest_insertion]" -) - -def main(_): - # ============== set optimization experiment configurations ================ - num_points = 100 # number of points in TSP - num_steps = 500 # the number of optimization steps - max_num_pairs = 10 # the maximum number of input-output pairs in meta-prompt - num_decimals = 0 # num of decimals for distances in meta-prompt - num_starting_points = 5 # the number of initial points for optimization - num_decode_per_step = 8 # the number of decoded solutions per step - - # ================ load LLM settings =================== - optimizer_llm_name = _OPTIMIZER.value - assert optimizer_llm_name in { - "text-bison", - "gpt-3.5-turbo", - "gpt-4", - } - openai_api_key = _OPENAI_API_KEY.value - palm_api_key = _PALM_API_KEY.value - - if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - assert openai_api_key, "The OpenAI API key must be provided." - openai.api_key = openai_api_key - else: - assert optimizer_llm_name == "text-bison" - assert ( - palm_api_key - ), "A PaLM API key is needed when prompting the text-bison model." - palm.configure(api_key=palm_api_key) - - # =================== create the result directory ========================== - datetime_str = ( - str(datetime.datetime.now().replace(microsecond=0)) - .replace(" ", "-") - .replace(":", "-") - ) - - save_folder = os.path.join( - OPRO_ROOT_PATH, - "outputs", - "optimization-results", - f"tsp-o-{optimizer_llm_name}-{datetime_str}/", - ) - os.makedirs(save_folder) - print(f"result directory:\n{save_folder}") - - # ====================== optimizer model configs ============================ - if optimizer_llm_name.lower() == "text-bison": - # when prompting text-bison with Cloud API - optimizer_finetuned_palm_temperature = 1.0 - optimizer_finetuned_palm_max_decode_steps = 1024 - optimizer_finetuned_palm_batch_size = 1 - optimizer_finetuned_palm_num_servers = 1 - optimizer_finetuned_palm_dict = dict() - optimizer_finetuned_palm_dict["temperature"] = ( - optimizer_finetuned_palm_temperature - ) - optimizer_finetuned_palm_dict["batch_size"] = ( - optimizer_finetuned_palm_batch_size - ) - optimizer_finetuned_palm_dict["num_servers"] = ( - optimizer_finetuned_palm_num_servers - ) - optimizer_finetuned_palm_dict["max_decode_steps"] = ( - optimizer_finetuned_palm_max_decode_steps - ) - - call_optimizer_finetuned_palm_server_func = functools.partial( - prompt_utils.call_palm_server_from_cloud, - # prompt_utils.call_vllm, - model="text-bison-001", - temperature=optimizer_finetuned_palm_dict["temperature"], - max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"], - ) - - optimizer_llm_dict = { - "model_type": optimizer_llm_name.lower(), - } - optimizer_llm_dict.update(optimizer_finetuned_palm_dict) - call_optimizer_server_func = call_optimizer_finetuned_palm_server_func - - else: - assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"} - optimizer_gpt_max_decode_steps = 1024 - optimizer_gpt_temperature = 1.0 - - optimizer_llm_dict = dict() - optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps - optimizer_llm_dict["temperature"] = optimizer_gpt_temperature - optimizer_llm_dict["batch_size"] = 1 - call_optimizer_server_func = functools.partial( - prompt_utils.call_openai_server_func, - model=optimizer_llm_name, - max_decode_steps=optimizer_gpt_max_decode_steps, - temperature=optimizer_gpt_temperature, - ) - - # ====================== try calling the servers ============================ - print("\n======== testing the optimizer server ===========") - optimizer_test_output = call_optimizer_server_func( - "Does the sun rise from the north? Just answer yes or no.", - temperature=1.0, - ) - print(f"optimizer test output: {optimizer_test_output}") - print("Finished testing the optimizer server.") - print("\n=================================================") - - # ====================== utility functions ============================ - def evaluate_distance(x, y, trace, num_decimals): # pylint: disable=invalid-name - dis = 0 - try: - for i in range(len(trace) - 1): - id0 = trace[i] - id1 = trace[i + 1] - dis += np.sqrt((x[id0] - x[id1]) ** 2 + (y[id0] - y[id1]) ** 2) - except: - return -1 - id0 = trace[-1] - id1 = trace[0] - dis += np.sqrt((x[id0] - x[id1]) ** 2 + (y[id0] - y[id1]) ** 2) - dis = np.round(dis, num_decimals) if num_decimals > 0 else int(dis) - return dis - - def solve_tsp(x, y, num_points, num_decimals, starting_algorithm): - if starting_algorithm == "nearest_neighbor": - min_dis = 0 - gt_sol = [0] - remaining_points = list(range(1, num_points)) - while len(remaining_points) > 0: - min_p = -1 - min_cur_dis = -1 - for p in remaining_points: - cur_dis = np.sqrt((x[p] - x[gt_sol[-1]]) ** 2 + (y[p] - y[gt_sol[-1]]) ** 2) - if min_p == -1 or cur_dis < min_cur_dis: - min_p = p - min_cur_dis = cur_dis - gt_sol.append(min_p) - min_dis += min_cur_dis - remaining_points.remove(min_p) - min_dis += np.sqrt((x[0] - x[gt_sol[-1]]) ** 2 + (y[0] - y[gt_sol[-1]]) ** 2) - min_dis = np.round(min_dis, num_decimals) if num_decimals > 0 else int(min_dis) - return gt_sol, min_dis - elif starting_algorithm == "farthest_insertion": - gt_sol = [0] - remaining_points = list(range(1, num_points)) - while len(remaining_points) > 0: - max_p = -1 - max_cur_dis = -1 - max_cur_index = -1 - for p in remaining_points: - min_cur_dis = -1 - min_cur_index = -1 - for index in range(1, len(gt_sol) + 1): - new_sol = gt_sol[:index] + [p] + gt_sol[index:] - cur_dis = evaluate_distance(x, y, new_sol, num_decimals) - if min_cur_dis == -1 or cur_dis < min_cur_dis: - min_cur_dis = cur_dis - min_cur_index = index - if max_cur_dis == -1 or min_cur_dis > max_cur_dis: - max_p = p - max_cur_dis = min_cur_dis - max_cur_index = min_cur_index - gt_sol = gt_sol[:max_cur_index] + [max_p] + gt_sol[max_cur_index:] - remaining_points.remove(max_p) - min_dis = evaluate_distance(x, y, gt_sol, num_decimals) - return gt_sol, min_dis - - f = {(0, 1): (0, [0])} - q = [(0, 1)] - min_dis = -1 - gt_sol = list(range(num_points)) - while len(q) > 0: - p, status = q[0] - q = q[1:] - for i in range(num_points): - if 2 << i >> 1 & status == 0: - new_status = status + (2 << i >> 1) - new_dis = f[(p, status)][0] + np.sqrt((x[i] - x[p]) ** 2 + (y[i] - y[p]) ** 2) - if (i, new_status) not in f or new_dis < f[(i, new_status)][0]: - f[(i, new_status)] = (new_dis, f[(p, status)][1] + [i]) - if new_status == (2 << num_points >> 1) - 1: - new_dis += np.sqrt((x[i] - x[0]) ** 2 + (y[i] - y[0]) ** 2) - if min_dis == -1 or new_dis < min_dis: - min_dis = new_dis - gt_sol = f[(i, new_status)][1][:] - elif (i, new_status) not in q: - q.append((i, new_status)) - min_dis = np.round(min_dis, num_decimals) if num_decimals > 0 else int(min_dis) - return gt_sol, min_dis - - def gen_meta_prompt( - old_value_pairs_set, - x, # pylint: disable=invalid-name - y, - max_num_pairs=100, - ): - """Generate the meta-prompt for optimization. - - Args: - old_value_pairs_set (set): the set of old traces. - X (np.array): the 1D array of x values. - y (np.array): the 1D array of y values. - num_decimals (int): the number of decimals in the - meta-prompt. - max_num_pairs (int): the maximum number of exemplars in the meta-prompt. - - Returns: - meta_prompt (str): the generated meta-prompt. - """ - old_value_pairs = list(old_value_pairs_set) - old_value_pairs = sorted(old_value_pairs, key=lambda x: -x[1])[ - -max_num_pairs: - ] - old_value_pairs_substr = "" - for trace, dis in old_value_pairs: - old_value_pairs_substr += f"\n {trace} \nlength:\n{dis}\n" - meta_prompt = "You are given a list of points with coordinates below:\n" - for i, (xi, yi) in enumerate(zip(x, y)): - if i: - meta_prompt += ", " - meta_prompt += f"({i}): ({xi}, {yi})" - meta_prompt += ".\n\nBelow are some previous traces and their lengths. The traces are arranged in descending order based on their lengths, where lower values are better.".strip() - meta_prompt += "\n\n" - meta_prompt += old_value_pairs_substr.strip() - meta_prompt += "\n\n" - meta_prompt += """Give me a new trace that is different from all traces above, and has a length lower than any of the above. The trace should traverse all points exactly once. The trace should start with '' and end with . - """.strip() - return meta_prompt - - def extract_string(input_string): - start_string = "" - end_string = "" - if start_string not in input_string: - return "" - input_string = input_string[input_string.index(start_string) + len(start_string):] - if end_string not in input_string: - return "" - input_string = input_string[:input_string.index(end_string)] - parsed_list = [] - for p in input_string.split(","): - p = p.strip() - try: - p = int(p) - except: - continue - parsed_list.append(p) - return parsed_list - - # ================= generate the ground truth trace ===================== - - x = np.random.uniform(low=-100, high=100, size=num_points) - y = np.random.uniform(low=-100, high=100, size=num_points) - x = [np.round(xi, num_decimals) if num_decimals > 0 else int(xi) for xi in x] - y = [np.round(yi, num_decimals) if num_decimals > 0 else int(yi) for yi in y] - - starting_algorithm = _START_ALGORITHM.value - - gt_sol, min_dis = solve_tsp(x, y, num_points, num_decimals, starting_algorithm) - print("ground truth solution" + str(gt_sol)) - print("min distance: ", min_dis) - gt_sol_str = ",".join([str(i) for i in gt_sol]) - point_list = range(num_points) - init_sols = [] - while len(init_sols) < num_starting_points: - sol = np.random.permutation(point_list) - if sol[0] != 0: - continue - sol_str = ",".join([str(i) for i in sol]) - if sol_str == gt_sol_str: - continue - init_sols.append(list(sol)) - - # ====================== run optimization ============================ - configs_dict = { - "num_starting_points": num_starting_points, - "num_decode_per_step": num_decode_per_step, - "optimizer_llm_configs": optimizer_llm_dict, - "data": { - "ground truth solution": [",".join([str(i) for i in gt_sol])], - "loss_at_true_values": min_dis, - "x": list(x), - "y": list(y), - }, - "init_sols": [",".join([str(i) for i in sol]) for sol in init_sols], - "num_steps": num_steps, - "max_num_pairs": max_num_pairs, - "num_decimals": num_decimals, - } - configs_json_path = os.path.join(save_folder, "configs.json") - print(f"saving configs to\n{configs_json_path}") - with open(configs_json_path, "w") as f: - json.dump(configs_dict, f, indent=4) - - old_value_pairs_set = set() - old_value_pairs_with_i_step = [] # format: [(trace, dis = f(trace), i_step)] - meta_prompts_dict = dict() # format: {i_step: meta_prompt} - raw_outputs_dict = dict() # format: {i_step: raw_outputs} - - for sol in init_sols: - dis = evaluate_distance(x, y, sol, num_decimals) - sol_str = ",".join([str(i) for i in sol]) - old_value_pairs_set.add((sol_str, dis)) - old_value_pairs_with_i_step.append((sol_str, dis, -1)) - - print("\n================ run optimization ==============") - print(f"initial points: {[tuple(item[:-1]) for item in old_value_pairs_set]}") - print(f"initial values: {[item[-1] for item in old_value_pairs_set]}") - results_json_path = os.path.join(save_folder, "results.json") - print(f"saving results to\n{results_json_path}") - - for i_step in range(num_steps): - print(f"\nStep {i_step}:") - meta_prompt = gen_meta_prompt( - old_value_pairs_set, - x, - y, - max_num_pairs=max_num_pairs, - ) - print("\n=================================================") - print(f"meta_prompt:\n{meta_prompt}") - meta_prompts_dict[i_step] = meta_prompt - raw_outputs = [] - parsed_outputs = [] - while len(parsed_outputs) < num_decode_per_step: - raw_output = call_optimizer_server_func(meta_prompt) - for string in raw_output: - print("\n=================================================") - print("raw output:\n", string) - try: - parsed_output = extract_string(string) - if parsed_output is not None and len(set(parsed_output)) == num_points and len(parsed_output) == num_points and parsed_output[0] == 0: - dis = evaluate_distance(x, y, parsed_output, num_decimals) - if dis == -1: - continue - parsed_outputs.append(parsed_output) - raw_outputs.append(string) - except: - pass - print("\n=================================================") - print(f"proposed points: {parsed_outputs}") - raw_outputs_dict[i_step] = raw_outputs - - # evaluate the values of proposed and rounded outputs - single_step_values = [] - for trace in parsed_outputs: - dis = evaluate_distance(x, y, trace, num_decimals) - single_step_values.append(dis) - trace_str = ",".join([str(i) for i in trace]) - old_value_pairs_set.add((trace_str, dis)) - old_value_pairs_with_i_step.append((trace_str, dis, i_step)) - print(f"single_step_values: {single_step_values}") - print("ground truth solution" + str(gt_sol)) - print("min distance: ", min_dis) - - # ====================== save results ============================ - results_dict = { - "meta_prompts": meta_prompts_dict, - "raw_outputs": raw_outputs_dict, - "old_value_pairs_with_i_step": old_value_pairs_with_i_step, - } - with open(results_json_path, "w") as f: - json.dump(results_dict, f, indent=4) - - -if __name__ == "__main__": - app.run(main) diff --git a/optimization/test.py b/optimization/test.py deleted file mode 100644 index a38abcc..0000000 --- a/optimization/test.py +++ /dev/null @@ -1,967 +0,0 @@ -# Copyright 2023 The OPRO Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -r"""The .py file for prompt optimization. - -Usage: - -Step 1: edit the starting instructions by modifying `initial_instructions` - -Step 2: edit the training ratio by modifying `train_ratio` - -Step 3: check if the model configs (like batch size) are the same as the actual serving configs - -Step 4: run - -``` -python optimize_instructions.py \ - --optimizer="gpt-3.5-turbo" --scorer="text-bison" \ - --instruction_pos="A_begin" --dataset="gsm8k" --task="train" -``` - -The outputs will then be written to `outputs/optimization-results/` in the opro folder. - -Notes: - -1. One or more API keys may need to be provided: -- When using a Google-Cloud-served model (like text-bison at https://developers.generativeai.google/tutorials/text_quickstart), add `--palm_api_key=` -- When using an OpenAI model, add `--openai_api_key=””` - -2. The initial instructions should be provided in the "initial_instructions" -variable. -""" - -import datetime -import functools -import os -import sys - -OPRO_ROOT_PATH = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -) -sys.path.insert(0, OPRO_ROOT_PATH) - -from absl import app -from absl import flags -import google.generativeai as palm -import numpy as np -import openai -from opro import prompt_utils -from opro.optimization import opt_utils -import pandas as pd - -ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data") - -_LOCAL_MODEL_PATH = flags.DEFINE_string("local_model_path", None, "Path to local vLLM model.") - - -_OPENAI_API_KEY = flags.DEFINE_string( - "openai_api_key", "", "The OpenAI API key." -) - -_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.") - -_SCORER = flags.DEFINE_string( - "scorer", "text-bison", "The name of the scorer LLM." -) - -_OPTIMIZER = flags.DEFINE_string( - "optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM." -) - -_DATASET = flags.DEFINE_string( - "dataset", "gsm8k", "The name of dataset to search for instructions on." -) - -_TASK = flags.DEFINE_string( - "task", - "train", - "The name of task within the above dataset to search for instructions on.", -) - -_INSTRUCTION_POS = flags.DEFINE_string( - "instruction_pos", - "A_begin", - "The position of the instruction to search for.", -) - -_META_PROMPT_TYPE = flags.DEFINE_string( - "meta_prompt_type", - "both_instructions_and_exemplars", - "The type of meta-prompt: whether to have both previous instructions and" - " dataset exemplars (often for fine-tuned optimizers), or to have only" - " previous instructions (often for pre-trained optimizers).", -) - - -def main(_): - openai_api_key = _OPENAI_API_KEY.value - palm_api_key = _PALM_API_KEY.value - scorer_llm_name = _SCORER.value - optimizer_llm_name = _OPTIMIZER.value - dataset_name = _DATASET.value.lower() - task_name = _TASK.value - meta_prompt_type = _META_PROMPT_TYPE.value - local_model_path = _LOCAL_MODEL_PATH.value - - assert dataset_name in { - "mmlu", - "bbh", - "gsm8k", - }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k." - if dataset_name == "mmlu": - assert task_name in { - "STEM", - "humanities", - "social sciences", - "otheran (business, health, misc.)", - } # for now only support searching on one MMLU category - elif dataset_name == "bbh": - assert task_name in { - "boolean_expressions", - "causal_judgement", - "date_understanding", - "disambiguation_qa", - "dyck_languages", - "formal_fallacies", - "geometric_shapes", - "hyperbaton", - "logical_deduction_five_objects", - "logical_deduction_seven_objects", - "logical_deduction_three_objects", - "movie_recommendation", - "multistep_arithmetic_two", - "navigate", - "object_counting", - "penguins_in_a_table", - "reasoning_about_colored_objects", - "ruin_names", - "salient_translation_error_detection", - "snarks", - "sports_understanding", - "temporal_sequences", - "tracking_shuffled_objects_five_objects", - "tracking_shuffled_objects_seven_objects", - "tracking_shuffled_objects_three_objects", - "web_of_lies", - "word_sorting", - } - else: - assert dataset_name == "gsm8k" - assert task_name in {"train", "test"} - - assert scorer_llm_name in { - "text-bison", - "gpt-3.5-turbo", - "gpt-4", - "local", - } - assert optimizer_llm_name in { - "text-bison", - "gpt-3.5-turbo", - "gpt-4", - "local", - } - assert meta_prompt_type in { - "both_instructions_and_exemplars", - "instructions_only", - } - - instruction_pos = _INSTRUCTION_POS.value - assert instruction_pos in { - "before_Q", - "Q_begin", - "Q_end", - "A_begin", - }, ( - "The instruction position should be either before the question, or at the" - " beginning of the question, at the end of the question, or at the" - " beginning of the answer." - ) - print( - f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:" - f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}" - ) - - # make sure the scorer and optimizer models are callable - - if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - assert openai_api_key, "The OpenAI API key must be provided." - openai.api_key = openai_api_key - elif scorer_llm_name == "text-bison": - assert scorer_llm_name == "text-bison" - assert ( - palm_api_key - ), "A PaLM API key is needed when prompting the text-bison model." - palm.configure(api_key=palm_api_key) - - elif scorer_llm_name == "local": - assert local_model_path, "The local model path must be provided." - assert os.path.exists(local_model_path), ( - f"The local model path {local_model_path} does not exist." - ) - # set the local model path for vLLM - # prompt_utils.call_local_server_func(local_model_path) - else: - raise ValueError( - f"Unknown scorer_llm_name: {scorer_llm_name}. " - "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local." - ) - - if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - assert openai_api_key, "The OpenAI API key must be provided." - openai.api_key = openai_api_key - elif optimizer_llm_name == "text-bison": - assert optimizer_llm_name == "text-bison" - assert ( - palm_api_key - ), "A PaLM API key is needed when prompting the text-bison model." - palm.configure(api_key=palm_api_key) - - elif optimizer_llm_name == "local": - assert local_model_path, "The local model path must be provided." - assert os.path.exists(local_model_path), ( - f"The local model path {local_model_path} does not exist." - ) - # set the local model path for vLLM - # prompt_utils.call_local_server_func(local_model_path) - else: - raise ValueError( - f"Unknown scorer_llm_name: {optimizer_llm_name}. " - "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local." - ) - - - if dataset_name == "mmlu": - root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data") - elif dataset_name == "bbh": - root_data_folder_path = os.path.join( - ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/" - ) - else: - assert dataset_name == "gsm8k" - root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data") - - # =================== create the result directory ========================== - datetime_str = ( - str(datetime.datetime.now().replace(microsecond=0)) - .replace(" ", "-") - .replace(":", "-") - ) - - save_folder = os.path.join( - OPRO_ROOT_PATH, - "outputs", - "optimization-results", - f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/", - ) - result_by_instruction_folder = os.path.join( - save_folder, "result_by_instruction" - ) - os.makedirs(result_by_instruction_folder) - print(f"result directory:\n{save_folder}") - - # ====================== scorer model configs ============================== - # difference between num_decodes and batch_size: - # - num_decodes: how many outputs we actually want for each input - # - batch_size: the batch size in model serving, should equal to that in - # model serving config - # 常量定义 - DEFAULT_MAX_TOKENS = 1024 - DEFAULT_TEMPERATURE = 0.0 - PALM_MODEL_NAME = "text-bison-001" - - if scorer_llm_name == "text-bison": - config = { - "temperature": DEFAULT_TEMPERATURE, - "max_decode_steps": DEFAULT_MAX_TOKENS, - "batch_size": 1, - "num_servers": 1, - } - call_scorer_server_func = functools.partial( - prompt_utils.call_palm_server_from_cloud, - model=PALM_MODEL_NAME, - **config - ) - scorer_llm_dict = {"model_type": "text-bison", **config} - - elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: - config = { - "temperature": DEFAULT_TEMPERATURE, - "max_decode_steps": DEFAULT_MAX_TOKENS, - "batch_size": 1, - "num_servers": 1, - } - call_scorer_server_func = functools.partial( - prompt_utils.call_openai_server_func, - model=scorer_llm_name.lower(), - **config - ) - scorer_llm_dict = {"model_type": scorer_llm_name.lower(), **config} - - elif scorer_llm_name == "local": - print(f"[DEBUG] local_model_path: {local_model_path}") - assert local_model_path, "Local model path must be provided." - config = { - "temperature": DEFAULT_TEMPERATURE, - "max_decode_steps": DEFAULT_MAX_TOKENS, - "batch_size": 8, - "num_servers": 8,# number of servers to use for local model - } - call_scorer_server_func = functools.partial( - prompt_utils.call_local_server_func, - local_model_path=local_model_path, - **config - ) - scorer_llm_dict = {"model_type": "local", **config} - - else: - raise ValueError(f"Unsupported model: {scorer_llm_name}") - -# if scorer_llm_name == "text-bison": -# # when prompting text-bison with Cloud API -# scorer_finetuned_palm_temperature = 0.0 -# scorer_finetuned_palm_max_decode_steps = 1024 -# scorer_finetuned_palm_batch_size = 1 -# scorer_finetuned_palm_num_servers = 1 -# scorer_finetuned_palm_dict = dict() -# scorer_finetuned_palm_dict["temperature"] = ( -# scorer_finetuned_palm_temperature -# ) -# scorer_finetuned_palm_dict["num_servers"] = ( -# scorer_finetuned_palm_num_servers -# ) -# scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size -# scorer_finetuned_palm_dict["max_decode_steps"] = ( -# scorer_finetuned_palm_max_decode_steps -# ) - -# call_scorer_finetuned_palm_server_func = functools.partial( -# prompt_utils.call_palm_server_from_cloud, -# model="text-bison-001", -# temperature=scorer_finetuned_palm_dict["temperature"], -# max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"], -# ) - -# scorer_llm_dict = { -# "model_type": scorer_llm_name.lower(), -# } -# scorer_llm_dict.update(scorer_finetuned_palm_dict) -# call_scorer_server_func = call_scorer_finetuned_palm_server_func - -# elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}: -# # assert scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"} -# scorer_gpt_max_decode_steps = 1024 -# scorer_gpt_temperature = 0.0 - -# scorer_gpt_dict = dict() -# scorer_gpt_dict["max_decode_steps"] = scorer_gpt_max_decode_steps -# scorer_gpt_dict["temperature"] = scorer_gpt_temperature -# scorer_gpt_dict["num_decodes"] = 1 -# scorer_gpt_dict["batch_size"] = 1 -# scorer_gpt_dict["num_servers"] = 1 - -# scorer_llm_dict = { -# "model_type": scorer_llm_name.lower(), -# } -# scorer_llm_dict.update(scorer_gpt_dict) -# call_scorer_server_func = functools.partial( -# prompt_utils.call_openai_server_func, -# model=scorer_llm_name.lower(), -# max_decode_steps=scorer_gpt_max_decode_steps, -# temperature=scorer_gpt_temperature, -# ) -# elif scorer_llm_name == "local": -# # local vLLM model -# scorer_local_max_decode_steps = 1024 -# scorer_local_temperature = 0.0 -# call_scorer_server_func = functools.partial( -# prompt_utils.call_local_model_server_func, -# model_path=local_model_path, -# max_decode_steps=scorer_local_max_decode_steps, -# temperature=scorer_local_temperature, -# ) -# else: -# raise ValueError( -# f"Unknown scorer_llm_name: {scorer_llm_name}. " -# "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local." -# ) - - # ====================== optimizer model configs ============================ - if optimizer_llm_name.lower() == "text-bison": - # PaLM text-bison 模型配置 - optimizer_llm_dict = { - "model_type": "text-bison", - "temperature": 1.0, # 更高的随机性以生成多样化解 - "max_decode_steps": 1024, # 最大生成长度 - "batch_size": 1, # 单样本处理 - "num_decodes": 8, # 生成8个候选结果 - "num_servers": 1 # 单服务器 - } - - call_optimizer_server_func = functools.partial( - prompt_utils.call_palm_server_from_cloud, - model="text-bison-001", - temperature=optimizer_llm_dict["temperature"], - max_decode_steps=optimizer_llm_dict["max_decode_steps"], - ) - - elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}: - # GPT 模型配置 - optimizer_llm_dict = { - "model_type": optimizer_llm_name.lower(), - "temperature": 1.0, # 更高的随机性 - "max_decode_steps": 512, # 较短的最大长度 - "batch_size": 1, - "num_decodes": 1 , # 单次生成 - "num_servers": 1 # 单服务器 - } - - call_optimizer_server_func = functools.partial( - prompt_utils.call_openai_server_func, - model=optimizer_llm_name, - max_decode_steps=optimizer_llm_dict["max_decode_steps"], - temperature=optimizer_llm_dict["temperature"], - ) - elif optimizer_llm_name.lower() == "local": - assert local_model_path, "Local model path must be provided." - optimizer_llm_dict = { - "model_type": optimizer_llm_name.lower(), - "temperature": 1.0, # 更高的随机性 - "max_decode_steps": 512, # 较短的最大长度 - "batch_size": 8, - "num_decodes": 1 , # 单次生成 - "num_servers": 8 # 单服务器 - } - call_optimizer_server_func = functools.partial( - prompt_utils.call_local_server_func, - local_model_path=local_model_path, - max_decode_steps=optimizer_llm_dict["max_decode_steps"], - temperature=optimizer_llm_dict["temperature"], - ) - - else: - raise ValueError( - f"Unsupported optimizer model: {optimizer_llm_name}. " - "Must be one of: text-bison, gpt-3.5-turbo, gpt-4" - ) - -# if optimizer_llm_name.lower() == "text-bison": -# # when prompting text-bison with Cloud API -# optimizer_finetuned_palm_temperature = 1.0 -# optimizer_finetuned_palm_num_decodes = 8 -# optimizer_finetuned_palm_max_decode_steps = 1024 -# optimizer_finetuned_palm_batch_size = 1 -# optimizer_finetuned_palm_num_servers = 1 -# optimizer_finetuned_palm_dict = dict() -# optimizer_finetuned_palm_dict["temperature"] = ( -# optimizer_finetuned_palm_temperature -# ) -# optimizer_finetuned_palm_dict["num_decodes"] = ( -# optimizer_finetuned_palm_num_decodes -# ) -# optimizer_finetuned_palm_dict["batch_size"] = ( -# optimizer_finetuned_palm_batch_size -# ) -# optimizer_finetuned_palm_dict["num_servers"] = ( -# optimizer_finetuned_palm_num_servers -# ) -# optimizer_finetuned_palm_dict["max_decode_steps"] = ( -# optimizer_finetuned_palm_max_decode_steps -# ) - -# call_optimizer_finetuned_palm_server_func = functools.partial( -# prompt_utils.call_palm_server_from_cloud, -# model="text-bison-001", -# temperature=optimizer_finetuned_palm_dict["temperature"], -# max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"], -# ) - -# optimizer_llm_dict = { -# "model_type": optimizer_llm_name.lower(), -# } -# optimizer_llm_dict.update(optimizer_finetuned_palm_dict) -# call_optimizer_server_func = call_optimizer_finetuned_palm_server_func - -# else: -# assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"} -# optimizer_gpt_max_decode_steps = 512 -# optimizer_gpt_temperature = 1.0 - -# optimizer_llm_dict = dict() -# optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps -# optimizer_llm_dict["temperature"] = optimizer_gpt_temperature -# optimizer_llm_dict["batch_size"] = 1 -# optimizer_llm_dict["num_decodes"] = 1 -# call_optimizer_server_func = functools.partial( -# prompt_utils.call_openai_server_func, -# model=optimizer_llm_name, -# max_decode_steps=optimizer_gpt_max_decode_steps, -# temperature=optimizer_gpt_temperature, -# ) - - # ====================== try calling the servers ============================ - print("\n======== testing the scorer and optimizer servers ===========") - scorer_test_output = call_scorer_server_func( - "Does the sun rise from the north? Just answer yes or no." - ) - print(f"number of scorer output decodes: {len(scorer_test_output)}") - print(f"scorer test output: {scorer_test_output}") - optimizer_test_output = call_optimizer_server_func( - "Does the sun rise from the north? Just answer yes or no.", - temperature=1.0, - ) - print(f"number of optimizer output decodes: {len(optimizer_test_output)}") - print(f"optimizer test output: {optimizer_test_output}") - print("Finished testing the servers.") - - # ====================== read data ============================ - print("\n================ prompt optimization settings ==============") - # from https://github.com/hendrycks/test/blob/master/categories.py - subcategories = { - "abstract_algebra": ["math"], - "anatomy": ["health"], - "astronomy": ["physics"], - "business_ethics": ["business"], - "clinical_knowledge": ["health"], - "college_biology": ["biology"], - "college_chemistry": ["chemistry"], - "college_computer_science": ["computer science"], - "college_mathematics": ["math"], - "college_medicine": ["health"], - "college_physics": ["physics"], - "computer_security": ["computer science"], - "conceptual_physics": ["physics"], - "econometrics": ["economics"], - "electrical_engineering": ["engineering"], - "elementary_mathematics": ["math"], - "formal_logic": ["philosophy"], - "global_facts": ["other"], - "high_school_biology": ["biology"], - "high_school_chemistry": ["chemistry"], - "high_school_computer_science": ["computer science"], - "high_school_european_history": ["history"], - "high_school_geography": ["geography"], - "high_school_government_and_politics": ["politics"], - "high_school_macroeconomics": ["economics"], - "high_school_mathematics": ["math"], - "high_school_microeconomics": ["economics"], - "high_school_physics": ["physics"], - "high_school_psychology": ["psychology"], - "high_school_statistics": ["math"], - "high_school_us_history": ["history"], - "high_school_world_history": ["history"], - "human_aging": ["health"], - "human_sexuality": ["culture"], - "international_law": ["law"], - "jurisprudence": ["law"], - "logical_fallacies": ["philosophy"], - "machine_learning": ["computer science"], - "management": ["business"], - "marketing": ["business"], - "medical_genetics": ["health"], - "miscellaneous": ["other"], - "moral_disputes": ["philosophy"], - "moral_scenarios": ["philosophy"], - "nutrition": ["health"], - "philosophy": ["philosophy"], - "prehistory": ["history"], - "professional_accounting": ["other"], - "professional_law": ["law"], - "professional_medicine": ["health"], - "professional_psychology": ["psychology"], - "public_relations": ["politics"], - "security_studies": ["politics"], - "sociology": ["culture"], - "us_foreign_policy": ["politics"], - "virology": ["health"], - "world_religions": ["philosophy"], - } - - categories = { - "STEM": [ - "physics", - "chemistry", - "biology", - "computer science", - "math", - "engineering", - ], - "humanities": ["history", "philosophy", "law"], - "social sciences": [ - "politics", - "culture", - "economics", - "geography", - "psychology", - ], - "other (business, health, misc.)": ["other", "business", "health"], - } - - if dataset_name == "mmlu": - # EITHER: filter by category - # category_names = [ - # "STEM", - # "humanities", - # "social sciences", - # "other (business, health, misc.)", - # ] - category_names = [task_name] - folder_name = "test" # one of {'auxiliary_train', 'dev', 'val', 'test'} - task_names = [] - for task_csv_name in os.listdir( - os.path.join(root_data_folder_path, folder_name) - ): - task_names.append(task_csv_name.split(".")[0]) - - tasks_in_category = [] - for category_name in category_names: - for task_name in task_names: - for subname in subcategories: - if subname in task_name: - if subcategories[subname][0] in categories[category_name]: - tasks_in_category.append(task_name) - break - - tasks_all = [(folder_name, task_name) for task_name in tasks_in_category] - multiple_choice_tasks = set([item[1] for item in tasks_all]) - boolean_tasks = set() - numerical_output_tasks = set() - - # OR: filter by task - # tasks_all = [ - # # ('test', 'abstract_algebra_test'), - # # ('test', 'college_computer_science_test'), - # # ('test', 'college_mathematics_test'), - # # ('test', 'college_physics_test'), - # # ('test', 'elementary_mathematics_test'), - # # ('test', 'global_facts_test'), - # # ('test', 'high_school_physics_test'), - # # ('test', 'machine_learning_test'), - # # ('test', 'management_test'), - # # ('test', 'medical_genetics_test'), - # # ('test', 'moral_scenarios_test'), - # # ('test', 'professional_psychology_test'), - # # ('test', 'public_relations_test'), - # # ('test', 'professional_law_test'), - # # ('test', 'high_school_psychology_test'), - # # ('test', 'high_school_world_history_test'), - # # ('test', 'human_aging_test'), - # # ('test', 'miscellaneous_test'), - # # ('test', 'moral_scenarios_test'), - # ('test', 'professional_psychology_test'), - # # ('test', 'security_studies_test'), - # ] - - elif dataset_name == "bbh": - tasks_all = [task_name] - assert ( - len(tasks_all) == 1 - ), "for now only support prompt optimization on one BBH task" - - # all BBH tasks are as below - # tasks_all = [ - # 'boolean_expressions', - # 'causal_judgement', - # 'date_understanding', - # 'disambiguation_qa', - # 'dyck_languages', - # 'formal_fallacies', - # 'geometric_shapes', - # 'hyperbaton', - # 'logical_deduction_five_objects', - # 'logical_deduction_seven_objects', - # 'logical_deduction_three_objects', - # 'movie_recommendation', - # 'multistep_arithmetic_two', - # 'navigate', - # 'object_counting', - # 'penguins_in_a_table', - # 'reasoning_about_colored_objects', - # 'ruin_names', - # 'salient_translation_error_detection', - # 'snarks', - # 'sports_understanding', - # 'temporal_sequences', - # 'tracking_shuffled_objects_five_objects', - # 'tracking_shuffled_objects_seven_objects', - # 'tracking_shuffled_objects_three_objects', - # 'web_of_lies', - # 'word_sorting' - # ] - numerical_output_tasks = { - "object_counting", - "multistep_arithmetic_two", - } - - multiple_choice_tasks = { - "date_understanding", - "disambiguation_qa", - "geometric_shapes", - "hyperbaton", - "logical_deduction_five_objects", - "logical_deduction_seven_objects", - "logical_deduction_three_objects", - "movie_recommendation", - "penguins_in_a_table", - "reasoning_about_colored_objects", - "ruin_names", - "salient_translation_error_detection", - "snarks", - "temporal_sequences", - "tracking_shuffled_objects_five_objects", - "tracking_shuffled_objects_seven_objects", - "tracking_shuffled_objects_three_objects", - } - - boolean_tasks = { - "boolean_expressions", # True or False - "causal_judgement", # yes or no - "formal_fallacies", # valid or invalid - "navigate", # yes or no - "sports_understanding", # yes or no - "web_of_lies", # yes or no - } - - else: - assert dataset_name in {"gsm8k"} - tasks_all = [task_name] - multiple_choice_tasks = set() - boolean_tasks = set() - numerical_output_tasks = set(tasks_all) - - if dataset_name == "mmlu": - raw_data = pd.DataFrame() - prediction_treat_as_number = False - prediction_treat_as_bool = False - elif dataset_name == "bbh": - raw_data = [] - prediction_treat_as_number = bool( - tasks_all[0] in numerical_output_tasks - ) # for now only check the first task - prediction_treat_as_bool = bool( - tasks_all[0] in boolean_tasks - ) # for now only check the first task - print( - f"prediction_treat_as_number: {prediction_treat_as_number}," - f" prediction_treat_as_bool: {prediction_treat_as_bool}" - ) - else: - assert dataset_name == "gsm8k" - raw_data = pd.DataFrame() - prediction_treat_as_number = True - prediction_treat_as_bool = False - - for t in tasks_all: - if dataset_name == "mmlu": - folder_name = t[0] - task_name = t[1] - single_task_df = pd.read_csv( - os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"), - index_col=None, - header=None, - ) - raw_data = pd.concat([raw_data, single_task_df]) - elif dataset_name == "bbh": - task_name = t - single_task_list = opt_utils.load_bbh_task_data( - task_name, base_dir=root_data_folder_path - ) - raw_data += single_task_list - else: - assert dataset_name == "gsm8k" - task_name = t - f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv") - single_task_df = pd.read_csv(f_gsm, sep="\t", header=None) - raw_data = pd.concat([raw_data, single_task_df]) - - if dataset_name == "mmlu": - num_examples = raw_data.shape[0] - elif dataset_name == "bbh": - num_examples = len(raw_data) - else: - assert dataset_name in {"gsm8k"} - num_examples = raw_data.shape[0] - print(f"number of examples in the current task: {num_examples}") - - # ================ split data into train/val/test ========================== - if dataset_name == "mmlu": - train_ratio = 0.8 - eval_ratio = 0.2 - elif dataset_name == "gsm8k": - train_ratio = 0.035 - eval_ratio = 0 - else: - assert dataset_name == "bbh" - train_ratio = 0.2 - eval_ratio = 0 - - # train-validation-test split - # It is important to sort the indices, as this ensures the is_multiple_choice - # Boolean variables match the data points. - assert train_ratio + eval_ratio <= 1 - test_ratio = 1 - train_ratio - eval_ratio - print( - f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, " - f"test_ratio: {test_ratio}" - ) - np.random.seed(0) - train_index = np.sort( - np.array( - np.random.choice( - num_examples, size=int(train_ratio * num_examples), replace=False - ) - ) - ) - eval_and_test_index = np.sort( - np.array(list(set(np.arange(num_examples)) - set(train_index))) - ) - eval_index = np.sort( - np.array( - np.random.choice( - eval_and_test_index, - size=int(eval_ratio * num_examples), - replace=False, - ) - ) - ) - - # ========== set other optimization experiment hyperparameters ============== - if scorer_llm_name == "text-bison": - old_instruction_score_threshold = 0.0 # 完全保留旧指令 表示不过滤任何历史指令(即使质量很低的旧指令也会保留)。 - # old_instruction_score_threshold = 0.15 # for GSM8K - elif scorer_llm_name == "local": - old_instruction_score_threshold = 0.3 - else: - assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"} # 模型校验 - old_instruction_score_threshold = 0.3 # 过滤低质量旧指令 - - if scorer_llm_name == "text-bison": - extract_final_answer_by_prompting_again = False # 是否通过二次提示提取最终答案(例如从冗长响应中提取关键内容) - include_qa = False # 是否在元提示中包含问答对 - evaluate_in_parallel = False # 是否并行评估 - elif scorer_llm_name == "local": - extract_final_answer_by_prompting_again = True - include_qa = True - evaluate_in_parallel = True - else: - assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"} - extract_final_answer_by_prompting_again = False - include_qa = False - evaluate_in_parallel = False - - optimizer_llm_temperature = optimizer_llm_dict["temperature"] - - num_few_shot_questions_for_instruction_refinement = 3 # number of few-shot questions 每次优化指令时参考的少样本示例数量(Few-shot QA对)。 - - # To change the number of generated instructions in each step, one should - # edit the value of the variable below, instead of editing the number of - # decodes in model parameters, because those values are limited by model - # serving configs. - num_generated_instructions_in_each_step = 3 # number of generated instructions in each step 每轮搜索生成的候选指令数量。 - num_search_steps = 50 # number of search steps 总优化迭代次数。 - - initial_instructions = [ - "Let's solve the problem.", - # "", - # "The answer is", - ] - few_shot_qa_pairs = True #是否使用少样本示例指导指令生成。 - # one of {'accumulative_most_frequent', 'current_most_frequent', 'random', - # 'constant'} - few_shot_selection_criteria = "random" #对多样性要求高时用 random,稳定性要求高时用 most_frequent。 - # whether to evaluate generated instructions on the exemplars in meta-prompt - evaluate_generated_ins_on_few_shot = False # 是否评估新指令 开发阶段设为 True调试指令质量。 - # whether to evaluate old instructions on the exemplars in the meta-prompt - evaluate_old_ins_on_few_shot = False # 是否评估旧指令 生产阶段设为 False加速运行。 - # every this number of steps, compute the accuracies of current-step - # instructions on the validation set - eval_interval = 3 # 每N步在验证集上测试当前指令的准确率。 - - max_num_instructions = ( - 20 # 元提示中保留的历史指令数量上限。 - ) - # 将连续分数离散化为N档(如0-100整数),简化模型理解。 - num_score_buckets = 100 - # whether to put old instructions and scores to before exemplars in - # 控制元提示中历史指令和少样本示例的顺序。 - meta_prompt_instructions_before_exemplars = True - - # ===================== run prompt optimization ====================== - - assert few_shot_selection_criteria in { - "accumulative_most_frequent", - "current_most_frequent", - "random", - "constant", - } - evolution_kwargs = { - "num_search_steps": num_search_steps, - "old_instruction_score_threshold": old_instruction_score_threshold, - "scorer_llm_dict": scorer_llm_dict, - "optimizer_llm_dict": optimizer_llm_dict, - "extract_final_answer_by_prompting_again": ( - extract_final_answer_by_prompting_again - ), - "include_qa": include_qa, - "evaluate_in_parallel": evaluate_in_parallel, - "tasks_all": tasks_all, - "train_ratio": train_ratio, - "eval_ratio": eval_ratio, - "test_ratio": test_ratio, - "train_index": train_index, - "eval_index": eval_index, - "dataset_name": dataset_name, - "task_name": task_name, - "num_examples": num_examples, - "root_data_folder_path": root_data_folder_path, - "optimizer_llm_temperature": optimizer_llm_temperature, - # "optimizer_llm_temperature_schedule": ( - # optimizer_llm_temperature_schedule - # ), - # "optimizer_llm_temperature_end": optimizer_llm_temperature_end, - "initial_instructions": initial_instructions, - "multiple_choice_tasks": multiple_choice_tasks, - "raw_data": raw_data, - "call_scorer_server_func": call_scorer_server_func, - "call_optimizer_server_func": call_optimizer_server_func, - "instruction_pos": instruction_pos, - "prediction_treat_as_number": prediction_treat_as_number, - "prediction_treat_as_bool": prediction_treat_as_bool, - "result_by_instruction_folder": result_by_instruction_folder, - "few_shot_qa_pairs": few_shot_qa_pairs, - "num_score_buckets": num_score_buckets, - "max_num_instructions": max_num_instructions, - "meta_prompt_type": meta_prompt_type, - "meta_prompt_instructions_before_exemplars": ( - meta_prompt_instructions_before_exemplars - ), - "few_shot_selection_criteria": few_shot_selection_criteria, - "optimizer_llm_name": optimizer_llm_name, - "num_generated_instructions_in_each_step": ( - num_generated_instructions_in_each_step - ), - "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot, - "num_few_shot_questions_for_instruction_refinement": ( - num_few_shot_questions_for_instruction_refinement - ), - "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot, - "eval_interval": eval_interval, - "save_folder": save_folder, - } - - opt_utils.run_evolution(**evolution_kwargs) - - -if __name__ == "__main__": - app.run(main) diff --git a/outputs/user_feedback.jsonl b/outputs/user_feedback.jsonl deleted file mode 100644 index e504227..0000000 --- a/outputs/user_feedback.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"session_id": "06e84e4b03c54df188961f4e033f9e77", "round": 1, "choice": "哪些奶茶含有较少的糖分?", "answer": "选择低糖奶茶时,可以通过品牌选择、糖分选项调整以及配料搭配来控制摄入的糖分。以下是一些具体建议和推荐:\n\n---\n\n### **一、低糖奶茶品牌及选项**\n1. **喜茶(HEYTEA)** \n - **推荐系列**:轻乳茶(如「轻乳茉莉奶绿」)、部分限定款(如「芝士奶盖茶」可选低糖)。 \n - **糖分控制**:默认糖度为50%,可选「三分糖」(约10%糖度)或「无糖」(使用代糖)。 \n - **注意**:奶盖本身含糖,建议选择「无奶盖」选项。\n\n2. **奈雪的茶** \n - **推荐系列**:鲜果茶(如「霸气橙子」)、「鲜果轻乳茶」系列。 \n - **糖分控制**:默认糖度为50%,可选「少糖」(约30%)或「无糖」(使用代糖)。 \n - **注意**:鲜果茶的天然果糖含量较高,需结合糖分选项综合判断。\n\n3. **CoCo都可** \n - **推荐系列**:「鲜果茶」(如「百香果鲜果茶」)、「轻乳茶」。 \n - **糖分控制**:默认全糖(约100%),可选「三分糖」(约30%)或「无糖」(代糖)。 \n - **注意**:部分经典奶茶(如「珍珠奶茶」)糖分较高,建议选择鲜果类饮品。\n\n4. **一点点** \n - **推荐系列**:「鲜果茶」(如「百香果鲜果茶」)、「轻乳茶」。 \n - **糖分控制**:默认全糖(约100%),可选「半糖」(50%)或「无糖」(代糖)。 \n - **注意**:部分奶盖茶含糖量较高,建议选择鲜果或轻乳系列。\n\n---\n\n### **二、如何选择低糖奶茶?**\n1. **糖分选项优先级** \n - **无糖/代糖**:使用赤藓糖醇、甜菊糖等代糖,几乎不含热量(但需注意部分人对代糖敏感)。 \n - **三分糖/半糖**:糖分约为全糖的30%-50%,适合控制摄入。 \n - **避免全糖**:全糖奶茶的糖分通常在"} -{"session_id": "af370656137a4b8f97d8ffac2c60b191", "round": 1, "choice": "瑞幸咖啡有哪些值得推荐的饮品?", "answer": "瑞幸咖啡近年来凭借高性价比和创新口味,推出了许多受欢迎的饮品。以下是一些值得推荐的饮品,涵盖经典款、特色新品和季节限定,适合不同口味和需求:\n\n---\n\n### **一、爆款必试系列**\n1. **生椰拿铁** \n - **推荐理由**:瑞幸的“现象级”产品,椰香浓郁与咖啡的苦香完美融合,口感顺滑,常年占据销量榜首。 \n - **小贴士**:可尝试“生椰拿铁+巧克力糖浆”组合,变成“生椰摩卡”,风味更丰富。\n\n2. **丝绒拿铁** \n - **推荐理由**:以红茶为基底,搭配丝滑牛奶,茶香与奶香平衡,适合喜欢茶咖融合的人群。 \n - **特色**:选用锡兰红茶,口感更醇厚,冷热皆宜。\n\n3. **厚乳拿铁** \n - **推荐理由**:使用厚乳(高乳脂含量的牛奶),奶香更浓郁,适合追求绵密口感的爱好者。\n\n---\n\n### **二、果味与创意系列**\n1. **冰椰拿铁**(夏季限定) \n - **推荐理由**:生椰拿铁的冰饮版本,加入冰块和椰香糖浆,清爽解暑,适合夏天。\n\n2. **蓝莓生椰拿铁** \n - **推荐理由**:在生椰拿铁基础上加入蓝莓糖浆,果香与椰香交织,甜而不腻。\n\n3. **蜜桃生椰拿铁** \n - **推荐理由**:蜜桃风味糖浆与生椰拿1:1搭配,清新果香与咖啡的苦香碰撞,适合喜欢果味的人。\n\n---\n\n### **三、季节限定款**\n1. **桂花拿铁**(秋季限定) \n - **推荐理由**:桂花糖浆与拿铁结合,香气扑鼻,甜度适中,是"}