去掉无关代码
This commit is contained in:
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -1,736 +0,0 @@
|
|||||||
|
|
||||||
import datetime
|
|
||||||
import functools
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
OPRO_ROOT_PATH = os.path.dirname(
|
|
||||||
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
|
|
||||||
)
|
|
||||||
sys.path.insert(0, OPRO_ROOT_PATH)
|
|
||||||
|
|
||||||
from absl import app
|
|
||||||
from absl import flags
|
|
||||||
import google.generativeai as palm
|
|
||||||
import numpy as np
|
|
||||||
import openai
|
|
||||||
from opro import prompt_utils
|
|
||||||
from opro.optimization import opt_utils
|
|
||||||
import pandas as pd
|
|
||||||
# Datasets are looked up under "<repository root>/data".
ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data")

# Handle on the global absl flag registry; `main` reads
# `FLAGS.local_model_path` through it.
FLAGS = flags.FLAGS

# Registered without keeping the flag holder, so it is read via FLAGS.
flags.DEFINE_string(
    name="local_model_path",
    default="",
    help="Path to local vLLM model.",
)

_OPENAI_API_KEY = flags.DEFINE_string(
    name="openai_api_key",
    default="",
    help="The OpenAI API key.",
)

_PALM_API_KEY = flags.DEFINE_string(
    name="palm_api_key",
    default="",
    help="The PaLM API key.",
)

_SCORER = flags.DEFINE_string(
    name="scorer",
    default="text-bison",
    help="The name of the scorer LLM.",
)

_OPTIMIZER = flags.DEFINE_string(
    name="optimizer",
    default="gpt-3.5-turbo",
    help="The name of the optimizer LLM.",
)

_DATASET = flags.DEFINE_string(
    name="dataset",
    default="gsm8k",
    help="The name of dataset to search for instructions on.",
)

_TASK = flags.DEFINE_string(
    name="task",
    default="train",
    help=(
        "The name of task within the above dataset to search for"
        " instructions on."
    ),
)

_INSTRUCTION_POS = flags.DEFINE_string(
    name="instruction_pos",
    default="A_begin",
    help="The position of the instruction to search for.",
)

_META_PROMPT_TYPE = flags.DEFINE_string(
    name="meta_prompt_type",
    default="both_instructions_and_exemplars",
    help=(
        "The type of meta-prompt: whether to have both previous instructions and"
        " dataset exemplars (often for fine-tuned optimizers), or to have only"
        " previous instructions (often for pre-trained optimizers)."
    ),
)
|
|
||||||
|
|
||||||
|
|
||||||
def _configure_llm_credentials(llm_name, role, openai_api_key, palm_api_key):
    """Install the API credentials needed to call one LLM.

    Args:
      llm_name: model name; "gpt-3.5-turbo", "gpt-4", "text-bison" or "local".
      role: "scorer" or "optimizer"; only used to word the error message.
      openai_api_key: key assigned to `openai.api_key` for the GPT models.
      palm_api_key: key handed to `palm.configure` for text-bison.

    Raises:
      AssertionError: if the key required by `llm_name` is empty.
      ValueError: if `llm_name` is not one of the supported models.
    """
    if llm_name in {"gpt-3.5-turbo", "gpt-4"}:
        assert openai_api_key, "The OpenAI API key must be provided."
        openai.api_key = openai_api_key
    elif llm_name == "text-bison":
        assert palm_api_key, "A PaLM API key is needed when prompting the text-bison model."
        palm.configure(api_key=palm_api_key)
    elif llm_name == "local":
        # Local model: no API key required.
        pass
    else:
        # The role is part of the message so that a bad optimizer name is no
        # longer reported as an unknown *scorer*.
        raise ValueError(f"Unknown {role} model: {llm_name}")


def main(_):
    """Run one prompt-optimization experiment configured through absl flags.

    Validates the flag values, configures the scorer and optimizer LLM
    callers, smoke-tests both, loads the selected dataset, draws the
    train/eval index split and hands everything to `opt_utils.run_evolution`.

    Args:
      _: positional arguments forwarded by `app.run`; unused.

    Raises:
      AssertionError: if a flag value is outside its supported set or a
        required API key is missing.
      ValueError: if a model name is not supported.
      SystemExit: with status 1 if `opt_utils.run_evolution` raises.
    """
    local_model_path = FLAGS.local_model_path
    openai_api_key = _OPENAI_API_KEY.value
    palm_api_key = _PALM_API_KEY.value
    scorer_llm_name = _SCORER.value
    optimizer_llm_name = _OPTIMIZER.value
    dataset_name = _DATASET.value.lower()
    task_name = _TASK.value
    meta_prompt_type = _META_PROMPT_TYPE.value

    # ========================= validate the flags ============================
    assert dataset_name in {
        "mmlu",
        "bbh",
        "gsm8k",
    }, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k."
    if dataset_name == "mmlu":
        assert task_name in {
            "STEM",
            "humanities",
            "social sciences",
            "other (business, health, misc.)",
        }  # for now only support searching on one MMLU category
    elif dataset_name == "bbh":
        assert task_name in {
            "boolean_expressions",
            "causal_judgement",
            "date_understanding",
            "disambiguation_qa",
            "dyck_languages",
            "formal_fallacies",
            "geometric_shapes",
            "hyperbaton",
            "logical_deduction_five_objects",
            "logical_deduction_seven_objects",
            "logical_deduction_three_objects",
            "movie_recommendation",
            "multistep_arithmetic_two",
            "navigate",
            "object_counting",
            "penguins_in_a_table",
            "reasoning_about_colored_objects",
            "ruin_names",
            "salient_translation_error_detection",
            "snarks",
            "sports_understanding",
            "temporal_sequences",
            "tracking_shuffled_objects_five_objects",
            "tracking_shuffled_objects_seven_objects",
            "tracking_shuffled_objects_three_objects",
            "web_of_lies",
            "word_sorting",
        }
    else:
        assert dataset_name == "gsm8k"
        assert task_name in {"train", "test"}

    supported_llm_names = {"text-bison", "gpt-3.5-turbo", "gpt-4", "local"}
    assert scorer_llm_name in supported_llm_names
    assert optimizer_llm_name in supported_llm_names
    assert meta_prompt_type in {
        "both_instructions_and_exemplars",
        "instructions_only",
    }

    instruction_pos = _INSTRUCTION_POS.value
    assert instruction_pos in {
        "before_Q",
        "Q_begin",
        "Q_end",
        "A_begin",
    }, (
        "The instruction position should be either before the question, or at the"
        " beginning of the question, at the end of the question, or at the"
        " beginning of the answer."
    )
    print(
        f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:"
        f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}"
    )

    # Scorer first, then optimizer, so a missing key is reported in the same
    # order as before.
    _configure_llm_credentials(
        scorer_llm_name, "scorer", openai_api_key, palm_api_key
    )
    _configure_llm_credentials(
        optimizer_llm_name, "optimizer", openai_api_key, palm_api_key
    )

    if dataset_name == "mmlu":
        root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data")
    elif dataset_name == "bbh":
        root_data_folder_path = os.path.join(
            ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/"
        )
    else:
        assert dataset_name == "gsm8k"
        root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data")

    # =================== create the result directory ==========================
    # Timestamp with spaces and colons replaced so it is usable in a path.
    datetime_str = (
        str(datetime.datetime.now().replace(microsecond=0))
        .replace(" ", "-")
        .replace(":", "-")
    )
    save_folder = os.path.join(
        OPRO_ROOT_PATH,
        "outputs",
        "optimization-results",
        f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/",
    )
    result_by_instruction_folder = os.path.join(
        save_folder, "result_by_instruction"
    )
    print(f"Results will be saved to: {os.path.abspath(result_by_instruction_folder)}")
    # Also creates `save_folder`, the parent of this directory.
    os.makedirs(result_by_instruction_folder, exist_ok=True)
    print(f"result directory:\n{save_folder}")

    # ====================== scorer model configs ==============================
    if scorer_llm_name == "text-bison":
        # when prompting text-bison with Cloud API
        scorer_llm_dict = {
            "model_type": scorer_llm_name.lower(),
            "temperature": 0.0,
            "num_servers": 1,
            "batch_size": 1,
            "max_decode_steps": 1024,
        }
        call_scorer_server_func = functools.partial(
            prompt_utils.call_palm_server_from_cloud,
            model="text-bison-001",
            temperature=scorer_llm_dict["temperature"],
            max_decode_steps=scorer_llm_dict["max_decode_steps"],
        )
    elif scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
        # Original author's note: this branch was switched to the local vLLM
        # version of the call function.
        scorer_gpt_max_decode_steps = 1024
        scorer_gpt_temperature = 0.0
        scorer_llm_dict = {
            "model_type": scorer_llm_name.lower(),
            "max_decode_steps": scorer_gpt_max_decode_steps,
            "temperature": scorer_gpt_temperature,
            "num_decodes": 1,
            "batch_size": 1,
            "num_servers": 1,
        }
        call_scorer_server_func = functools.partial(
            prompt_utils.call_openai_server_func,
            max_decode_steps=scorer_gpt_max_decode_steps,
            temperature=scorer_gpt_temperature,
            local_model_path=local_model_path,  # path of the local model
        )
    else:
        raise ValueError(f"Unsupported scorer_llm_name: {scorer_llm_name}")

    # ====================== optimizer model configs ============================
    if optimizer_llm_name.lower() == "text-bison":
        # when prompting text-bison with Cloud API
        optimizer_llm_dict = {
            "model_type": optimizer_llm_name.lower(),
            "temperature": 1.0,
            "num_decodes": 8,
            "batch_size": 1,
            "num_servers": 1,
            "max_decode_steps": 1024,
        }
        call_optimizer_server_func = functools.partial(
            prompt_utils.call_palm_server_from_cloud,
            model="text-bison-001",
            temperature=optimizer_llm_dict["temperature"],
            max_decode_steps=optimizer_llm_dict["max_decode_steps"],
        )
    elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
        # Original author's note: calls are replaced by the local vLLM version.
        optimizer_gpt_max_decode_steps = 512
        optimizer_gpt_temperature = 1.0
        # NOTE(review): unlike the other three config dicts this one carries no
        # "model_type" entry; left unchanged because the consumer is not
        # visible here -- confirm whether that is intended.
        optimizer_llm_dict = {
            "max_decode_steps": optimizer_gpt_max_decode_steps,
            "temperature": optimizer_gpt_temperature,
            "batch_size": 1,
            "num_decodes": 1,
        }
        call_optimizer_server_func = functools.partial(
            prompt_utils.call_openai_server_func,
            max_decode_steps=optimizer_gpt_max_decode_steps,
            temperature=optimizer_gpt_temperature,
            local_model_path=local_model_path,
        )
    else:
        raise ValueError(f"Unsupported optimizer_llm_name: {optimizer_llm_name}")

    # ====================== try calling the servers ============================
    print("\n======== testing the scorer and optimizer servers ===========")
    scorer_test_output = call_scorer_server_func(
        "Does the sun rise from the north? Just answer yes or no."
    )
    print(f"number of scorer output decodes: {len(scorer_test_output)}")
    print(f"scorer test output: {scorer_test_output}")
    optimizer_test_output = call_optimizer_server_func(
        "Does the sun rise from the north? Just answer yes or no.",
        temperature=1.0,
    )
    print(f"number of optimizer output decodes: {len(optimizer_test_output)}")
    print(f"optimizer test output: {optimizer_test_output}")
    print("Finished testing the servers.")

    # ====================== read data ============================
    print("\n================ prompt optimization settings ==============")
    # from https://github.com/hendrycks/test/blob/master/categories.py
    subcategories = {
        "abstract_algebra": ["math"],
        "anatomy": ["health"],
        "astronomy": ["physics"],
        "business_ethics": ["business"],
        "clinical_knowledge": ["health"],
        "college_biology": ["biology"],
        "college_chemistry": ["chemistry"],
        "college_computer_science": ["computer science"],
        "college_mathematics": ["math"],
        "college_medicine": ["health"],
        "college_physics": ["physics"],
        "computer_security": ["computer science"],
        "conceptual_physics": ["physics"],
        "econometrics": ["economics"],
        "electrical_engineering": ["engineering"],
        "elementary_mathematics": ["math"],
        "formal_logic": ["philosophy"],
        "global_facts": ["other"],
        "high_school_biology": ["biology"],
        "high_school_chemistry": ["chemistry"],
        "high_school_computer_science": ["computer science"],
        "high_school_european_history": ["history"],
        "high_school_geography": ["geography"],
        "high_school_government_and_politics": ["politics"],
        "high_school_macroeconomics": ["economics"],
        "high_school_mathematics": ["math"],
        "high_school_microeconomics": ["economics"],
        "high_school_physics": ["physics"],
        "high_school_psychology": ["psychology"],
        "high_school_statistics": ["math"],
        "high_school_us_history": ["history"],
        "high_school_world_history": ["history"],
        "human_aging": ["health"],
        "human_sexuality": ["culture"],
        "international_law": ["law"],
        "jurisprudence": ["law"],
        "logical_fallacies": ["philosophy"],
        "machine_learning": ["computer science"],
        "management": ["business"],
        "marketing": ["business"],
        "medical_genetics": ["health"],
        "miscellaneous": ["other"],
        "moral_disputes": ["philosophy"],
        "moral_scenarios": ["philosophy"],
        "nutrition": ["health"],
        "philosophy": ["philosophy"],
        "prehistory": ["history"],
        "professional_accounting": ["other"],
        "professional_law": ["law"],
        "professional_medicine": ["health"],
        "professional_psychology": ["psychology"],
        "public_relations": ["politics"],
        "security_studies": ["politics"],
        "sociology": ["culture"],
        "us_foreign_policy": ["politics"],
        "virology": ["health"],
        "world_religions": ["philosophy"],
    }

    categories = {
        "STEM": [
            "physics",
            "chemistry",
            "biology",
            "computer science",
            "math",
            "engineering",
        ],
        "humanities": ["history", "philosophy", "law"],
        "social sciences": [
            "politics",
            "culture",
            "economics",
            "geography",
            "psychology",
        ],
        "other (business, health, misc.)": ["other", "business", "health"],
    }

    if dataset_name == "mmlu":
        category_names = [task_name]
        folder_name = "test"  # one of {'auxiliary_train', 'dev', 'val', 'test'}
        csv_task_names = [
            csv_name.split(".")[0]
            for csv_name in os.listdir(
                os.path.join(root_data_folder_path, folder_name)
            )
        ]

        # Keep a CSV task when a known subcategory name occurs in its file name
        # and that subcategory belongs to the requested category. The loop
        # variable is deliberately not called `task_name`, so the flag value is
        # not clobbered here.
        tasks_in_category = []
        for category_name in category_names:
            for csv_task_name in csv_task_names:
                for subname in subcategories:
                    if subname in csv_task_name:
                        if subcategories[subname][0] in categories[category_name]:
                            tasks_in_category.append(csv_task_name)
                            break

        tasks_all = [(folder_name, name) for name in tasks_in_category]
        multiple_choice_tasks = {item[1] for item in tasks_all}
        boolean_tasks = set()
        numerical_output_tasks = set()

    elif dataset_name == "bbh":
        tasks_all = [task_name]
        assert (
            len(tasks_all) == 1
        ), "for now only support prompt optimization on one BBH task"

        numerical_output_tasks = {
            "object_counting",
            "multistep_arithmetic_two",
        }

        multiple_choice_tasks = {
            "date_understanding",
            "disambiguation_qa",
            "geometric_shapes",
            "hyperbaton",
            "logical_deduction_five_objects",
            "logical_deduction_seven_objects",
            "logical_deduction_three_objects",
            "movie_recommendation",
            "penguins_in_a_table",
            "reasoning_about_colored_objects",
            "ruin_names",
            "salient_translation_error_detection",
            "snarks",
            "temporal_sequences",
            "tracking_shuffled_objects_five_objects",
            "tracking_shuffled_objects_seven_objects",
            "tracking_shuffled_objects_three_objects",
        }

        boolean_tasks = {
            "boolean_expressions",  # True or False
            "causal_judgement",  # yes or no
            "formal_fallacies",  # valid or invalid
            "navigate",  # yes or no
            "sports_understanding",  # yes or no
            "web_of_lies",  # yes or no
        }

    else:
        assert dataset_name in {"gsm8k"}
        tasks_all = [task_name]
        multiple_choice_tasks = set()
        boolean_tasks = set()
        numerical_output_tasks = set(tasks_all)

    if dataset_name == "mmlu":
        raw_data = pd.DataFrame()
        prediction_treat_as_number = False
        prediction_treat_as_bool = False
    elif dataset_name == "bbh":
        raw_data = []
        # for now only check the first task
        prediction_treat_as_number = bool(tasks_all[0] in numerical_output_tasks)
        prediction_treat_as_bool = bool(tasks_all[0] in boolean_tasks)
        print(
            f"prediction_treat_as_number: {prediction_treat_as_number},"
            f" prediction_treat_as_bool: {prediction_treat_as_bool}"
        )
    else:
        assert dataset_name == "gsm8k"
        raw_data = pd.DataFrame()
        prediction_treat_as_number = True
        prediction_treat_as_bool = False

    for t in tasks_all:
        if dataset_name == "mmlu":
            folder_name = t[0]
            # NOTE(review): as in the original code, `task_name` ends up as the
            # last loaded MMLU task (not the category flag) and is what gets
            # passed to run_evolution below -- confirm this is intended.
            task_name = t[1]
            single_task_df = pd.read_csv(
                os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"),
                index_col=None,
                header=None,
            )
            raw_data = pd.concat([raw_data, single_task_df])
        elif dataset_name == "bbh":
            task_name = t
            single_task_list = opt_utils.load_bbh_task_data(
                task_name, base_dir=root_data_folder_path
            )
            raw_data += single_task_list
        else:
            assert dataset_name == "gsm8k"
            task_name = t
            f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv")
            single_task_df = pd.read_csv(f_gsm, sep="\t", header=None)
            raw_data = pd.concat([raw_data, single_task_df])

    if dataset_name == "bbh":
        num_examples = len(raw_data)  # raw_data is a list here
    else:
        assert dataset_name in {"mmlu", "gsm8k"}
        num_examples = raw_data.shape[0]  # raw_data is a DataFrame here
    print(f"number of examples in the current task: {num_examples}")

    # ================ split data into train/val/test ==========================
    if dataset_name == "mmlu":
        train_ratio = 0.8
        eval_ratio = 0.2
    elif dataset_name == "gsm8k":
        # Was 0.035; lowered to 0.01 (about 74 examples per the original
        # author's note).
        train_ratio = 0.01
        eval_ratio = 0
    else:
        assert dataset_name == "bbh"
        train_ratio = 0.2
        eval_ratio = 0

    assert train_ratio + eval_ratio <= 1
    test_ratio = 1 - train_ratio - eval_ratio
    print(
        f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, "
        f"test_ratio: {test_ratio}"
    )
    # Fixed seed, and the two draws happen in this order, so the split is
    # reproducible across runs.
    np.random.seed(0)
    train_index = np.sort(
        np.array(
            np.random.choice(
                num_examples, size=int(train_ratio * num_examples), replace=False
            )
        )
    )
    eval_and_test_index = np.sort(
        np.array(list(set(np.arange(num_examples)) - set(train_index)))
    )
    eval_index = np.sort(
        np.array(
            np.random.choice(
                eval_and_test_index,
                size=int(eval_ratio * num_examples),
                replace=False,
            )
        )
    )

    # ========== set other optimization experiment hyperparameters ==============
    if scorer_llm_name == "text-bison":
        old_instruction_score_threshold = 0.0
    else:
        assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4", "local"}
        old_instruction_score_threshold = 0.3

    # These three settings had the same value for every supported scorer.
    extract_final_answer_by_prompting_again = False
    include_qa = False
    evaluate_in_parallel = False

    optimizer_llm_temperature = optimizer_llm_dict["temperature"]

    # Reduced from 3 to use fewer few-shot exemplars.
    num_few_shot_questions_for_instruction_refinement = 1
    # Reduced from 8: only two instructions are generated per step.
    num_generated_instructions_in_each_step = 2
    # Reduced from 200: three search steps.
    num_search_steps = 3

    initial_instructions = [
        "Let's solve the problem.",
    ]
    few_shot_qa_pairs = True
    # one of {'accumulative_most_frequent', 'current_most_frequent', 'random',
    # 'constant'}
    few_shot_selection_criteria = "random"
    # whether to evaluate generated instructions on the exemplars in meta-prompt
    evaluate_generated_ins_on_few_shot = False
    # whether to evaluate old instructions on the exemplars in the meta-prompt
    evaluate_old_ins_on_few_shot = False
    # every this number of steps, compute the accuracies of current-step
    # instructions on the validation set (1 = evaluate at every step so
    # results show up immediately)
    eval_interval = 1
    # the maximum number of instructions and scores in the meta-prompt
    max_num_instructions = 20
    # The number of buckets when converting scores to integers in the meta-prompt.
    num_score_buckets = 100
    # whether to put old instructions and scores to before exemplars in
    # the meta-prompt
    meta_prompt_instructions_before_exemplars = True

    # ===================== run prompt optimization ======================
    assert few_shot_selection_criteria in {
        "accumulative_most_frequent",
        "current_most_frequent",
        "random",
        "constant",
    }
    evolution_kwargs = {
        "num_search_steps": num_search_steps,
        "old_instruction_score_threshold": old_instruction_score_threshold,
        "scorer_llm_dict": scorer_llm_dict,
        "optimizer_llm_dict": optimizer_llm_dict,
        "extract_final_answer_by_prompting_again": (
            extract_final_answer_by_prompting_again
        ),
        "include_qa": include_qa,
        "evaluate_in_parallel": evaluate_in_parallel,
        "tasks_all": tasks_all,
        "train_ratio": train_ratio,
        "eval_ratio": eval_ratio,
        "test_ratio": test_ratio,
        "train_index": train_index,
        "eval_index": eval_index,
        "dataset_name": dataset_name,
        "task_name": task_name,
        "num_examples": num_examples,
        "root_data_folder_path": root_data_folder_path,
        "optimizer_llm_temperature": optimizer_llm_temperature,
        "initial_instructions": initial_instructions,
        "multiple_choice_tasks": multiple_choice_tasks,
        "raw_data": raw_data,
        "call_scorer_server_func": call_scorer_server_func,
        "call_optimizer_server_func": call_optimizer_server_func,
        "instruction_pos": instruction_pos,
        "prediction_treat_as_number": prediction_treat_as_number,
        "prediction_treat_as_bool": prediction_treat_as_bool,
        "result_by_instruction_folder": result_by_instruction_folder,
        "few_shot_qa_pairs": few_shot_qa_pairs,
        "num_score_buckets": num_score_buckets,
        "max_num_instructions": max_num_instructions,
        "meta_prompt_type": meta_prompt_type,
        "meta_prompt_instructions_before_exemplars": (
            meta_prompt_instructions_before_exemplars
        ),
        "few_shot_selection_criteria": few_shot_selection_criteria,
        "optimizer_llm_name": optimizer_llm_name,
        "num_generated_instructions_in_each_step": (
            num_generated_instructions_in_each_step
        ),
        "evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot,
        "num_few_shot_questions_for_instruction_refinement": (
            num_few_shot_questions_for_instruction_refinement
        ),
        "evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot,
        "eval_interval": eval_interval,
        "save_folder": save_folder,
    }
    print("=== 开始优化过程 ===")
    try:
        opt_utils.run_evolution(**evolution_kwargs)
        print("=== 优化完成 ===")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback and exit
        # with a non-zero status instead of letting the exception propagate.
        import traceback

        print(f"!!! 优化失败: {e} !!!", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    app.run(main)
|
|
||||||
|
|
||||||
@@ -1,424 +0,0 @@
|
|||||||
# Copyright 2023 The OPRO Authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
r"""Optimize over the objective function of a linear regression problem.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
|
|
||||||
```
|
|
||||||
python optimize_linear_regression.py --optimizer="text-bison"
|
|
||||||
```
|
|
||||||
|
|
||||||
Note:
|
|
||||||
- When using a Google-Cloud-served model (like text-bison at
|
|
||||||
https://developers.generativeai.google/tutorials/text_quickstart), add
|
|
||||||
`--palm_api_key="<your_key>"`
|
|
||||||
- When using an OpenAI model, add `--openai_api_key="<your_key>"`
|
|
||||||
"""
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import functools
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
|
|
||||||
OPRO_ROOT_PATH = os.path.dirname(
|
|
||||||
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
|
|
||||||
)
|
|
||||||
sys.path.insert(0, OPRO_ROOT_PATH)
|
|
||||||
|
|
||||||
from absl import app
|
|
||||||
from absl import flags
|
|
||||||
import google.generativeai as palm
|
|
||||||
import numpy as np
|
|
||||||
import openai
|
|
||||||
|
|
||||||
from opro import prompt_utils
|
|
||||||
|
|
||||||
# Command-line flags of the linear-regression optimization script; `main`
# reads each one through the returned holder's `.value`.
_OPENAI_API_KEY = flags.DEFINE_string(
    name="openai_api_key",
    default="",
    help="The OpenAI API key.",
)

_PALM_API_KEY = flags.DEFINE_string(
    name="palm_api_key",
    default="",
    help="The PaLM API key.",
)

_OPTIMIZER = flags.DEFINE_string(
    name="optimizer",
    default="gpt-3.5-turbo",
    help="The name of the optimizer LLM.",
)
|
|
||||||
|
|
||||||
|
|
||||||
def main(_):
  """Run LLM-driven optimization of a linear regression objective.

  This first part reads the optimizer name and API keys from the command-line
  flags, configures the matching client, creates a timestamped result
  directory and sends one test prompt to the optimizer.

  Raises:
    ValueError: if the optimizer name is unsupported or the API key needed by
      the selected optimizer is empty.
  """
  # ============== set optimization experiment configurations ================
  num_points = 50  # number of points in linear regression
  w_true = 15  # the true w
  b_true = 14  # the true b
  max_num_steps = 500  # the number of optimization steps
  num_reps = 5  # the number of repeated runs
  max_num_pairs = 20  # the maximum number of input-output pairs in meta-prompt
  num_input_decimals = 0  # num of decimals for input values in meta-prompt
  num_output_decimals = 0  # num of decimals for output values in meta-prompt
  num_generated_points_in_each_step = 8

  # ================ load LLM settings ===================
  # Explicit raises instead of asserts so the checks survive `python -O`.
  optimizer_llm_name = _OPTIMIZER.value
  is_openai_model = optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
  if not is_openai_model and optimizer_llm_name != "text-bison":
    raise ValueError(
        f"Unsupported optimizer {optimizer_llm_name!r}; expected one of"
        " 'text-bison', 'gpt-3.5-turbo', 'gpt-4'."
    )
  openai_api_key = _OPENAI_API_KEY.value
  palm_api_key = _PALM_API_KEY.value

  if is_openai_model:
    if not openai_api_key:
      raise ValueError("The OpenAI API key must be provided.")
    openai.api_key = openai_api_key
  else:
    if not palm_api_key:
      raise ValueError(
          "A PaLM API key is needed when prompting the text-bison model."
      )
    palm.configure(api_key=palm_api_key)

  # =================== create the result directory ==========================
  datetime_str = (
      str(datetime.datetime.now().replace(microsecond=0))
      .replace(" ", "-")
      .replace(":", "-")
  )

  save_folder = os.path.join(
      OPRO_ROOT_PATH,
      "outputs",
      "optimization-results",
      f"linear_regression-o-{optimizer_llm_name}-{datetime_str}/",
  )
  os.makedirs(save_folder)
  print(f"result directory:\n{save_folder}")

  # ====================== optimizer model configs ============================
  if optimizer_llm_name.lower() == "text-bison":
    # when prompting text-bison with Cloud API
    optimizer_finetuned_palm_dict = {
        "temperature": 1.0,
        "batch_size": 1,
        "num_servers": 1,
        "max_decode_steps": 1024,
    }
    call_optimizer_server_func = functools.partial(
        prompt_utils.call_palm_server_from_cloud,
        model="text-bison-001",
        temperature=optimizer_finetuned_palm_dict["temperature"],
        max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
    )

    # "model_type" goes first so the saved config keeps its key order.
    optimizer_llm_dict = {
        "model_type": optimizer_llm_name.lower(),
    }
    optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
  else:
    optimizer_gpt_max_decode_steps = 1024
    optimizer_gpt_temperature = 1.0

    optimizer_llm_dict = {
        "max_decode_steps": optimizer_gpt_max_decode_steps,
        "temperature": optimizer_gpt_temperature,
        "batch_size": 1,
    }
    call_optimizer_server_func = functools.partial(
        prompt_utils.call_openai_server_func,
        model=optimizer_llm_name,
        max_decode_steps=optimizer_gpt_max_decode_steps,
        temperature=optimizer_gpt_temperature,
    )

  # ====================== try calling the servers ============================
  print("\n======== testing the optimizer server ===========")
  optimizer_test_output = call_optimizer_server_func(
      "Does the sun rise from the north? Just answer yes or no.",
      temperature=1.0,
  )
  print(f"optimizer test output: {optimizer_test_output}")
  print("Finished testing the optimizer server.")
  print("\n=================================================")
|
|
||||||
|
|
||||||
# ====================== utility functions ============================
|
|
||||||
def evaluate_loss(X, y, w, b):  # pylint: disable=invalid-name
  """Return the squared L2 norm of the residual y - (X * w + b)."""
  prediction = X * w + b
  return np.linalg.norm(y - prediction) ** 2
|
|
||||||
|
|
||||||
def gen_meta_prompt(
    old_value_pairs_set,
    X,  # pylint: disable=invalid-name, unused-argument
    y,  # pylint: disable=unused-argument
    num_input_decimals=5,
    num_output_decimals=5,
    max_num_pairs=100,
):
  """Build the meta-prompt that asks the optimizer for a better (w, b) pair.

  Args:
    old_value_pairs_set (set): previously evaluated (w, b, z) triples.
    X (np.array): the x values; not read by this function.
    y (np.array): the y values; not read by this function.
    num_input_decimals (int): decimals kept for w and b in the prompt; when
      not positive the values are converted with int().
    num_output_decimals (int): decimals kept for z in the prompt; when not
      positive the value is converted with int().
    max_num_pairs (int): the maximum number of exemplars shown in the prompt.

  Returns:
    meta_prompt (str): the generated meta-prompt.
  """

  def _as_prompt_number(value, num_decimals):
    # Positive decimals round; otherwise the value is converted with int().
    if num_decimals > 0:
      return np.round(value, num_decimals)
    return int(value)

  formatted_triples = [
      (
          _as_prompt_number(w, num_input_decimals),
          _as_prompt_number(b, num_input_decimals),
          _as_prompt_number(z, num_output_decimals),
      )
      for w, b, z in old_value_pairs_set
  ]
  # Triples that collapse to the same formatted numbers are shown only once.
  distinct_triples = set(formatted_triples)

  # Sort by descending value and keep the tail, i.e. the lowest values.
  ranked_triples = sorted(distinct_triples, key=lambda triple: -triple[2])
  shown_triples = ranked_triples[-max_num_pairs:]
  exemplar_text = "".join(
      f"\ninput:\nw={w}, b={b}\nvalue:\n{z}\n" for w, b, z in shown_triples
  )

  task_description = (
      "Now you will help me minimize a function with two input variables w,"
      " b. I have some (w, b) pairs and the function values at those points."
      " The pairs are arranged in descending order based on their function"
      " values, where lower values are better."
  )
  request = (
      "Give me a new (w, b) pair that is different from all pairs above, and"
      " has a function value lower than any of the above. Do not write code."
      " The output must end with a pair [w, b], where w and b are numerical"
      " values."
  )
  return "\n\n".join([task_description, exemplar_text.strip(), request])
|
|
||||||
|
|
||||||
def extract_string_in_square_brackets(input_string):
  """Return the text inside the last usable "[...]" group of a string.

  Groups are scanned from the end. A group that contains "w" or "b" but no
  "=" (such as a literal "[w, b]") is skipped.

  Args:
    input_string (str): the text to search.

  Returns:
    The contents of the chosen group without the brackets, or "" when no
    group qualifies.
  """
  for candidate in reversed(re.findall(r"\[.*?\]", input_string)):
    names_without_values = "=" not in candidate and (
        "w" in candidate or "b" in candidate
    )
    if not names_without_values:
      return candidate[1:-1]
  return ""
|
|
||||||
|
|
||||||
def parse_output(extracted_output):
  """Convert an extracted 'w, b' string into a float numpy array.

  Each comma-separated item may carry a "name=" prefix; only the text after
  the first "=" is kept.

  Args:
    extracted_output (str): the extracted output string, like '1.5, 2.5'.

  Returns:
    parsed_output (np.array): the parsed values, like [1.5, 2.5], or None
      when the input is empty.

  Raises:
    ValueError: if an item cannot be converted to float.
  """
  if not extracted_output:
    return None
  value_strings = [
      (piece.partition("=")[2] if "=" in piece else piece).strip()
      for piece in extracted_output.split(",")
  ]
  return np.array(value_strings).astype(float)
|
|
||||||
|
|
||||||
# Repeated optimization runs. Each repetition builds its own noisy data set and
# starting points, then asks the optimizer for new (w, b) pairs until
# (w_true, b_true) is proposed or max_num_steps is reached.
configs_dict = dict()
results_dict = dict()
num_convergence_steps = []  # step count of every repetition that converged
for i_rep in range(num_reps):
  found_optimal = False
  print(f"\nRep {i_rep}:")

  # ================= generate the ground truth X, y =====================
  X = np.arange(num_points).astype(float) + 1  # pylint: disable=invalid-name
  np.random.seed(i_rep + 1)
  y = X * w_true + b_true + np.random.randn(num_points)
  loss_at_true_values = evaluate_loss(X, y, w_true, b_true)
  print(f"value at (w_true, b_true): {loss_at_true_values}")

  # ================= generate the starting points =====================
  num_starting_points = 5  # the number of initial points for optimization
  np.random.seed((i_rep + 1) * 10)
  init_w = np.random.uniform(low=10, high=20, size=num_starting_points)
  np.random.seed((i_rep + 1) * 100)
  init_b = np.random.uniform(low=10, high=20, size=num_starting_points)

  # ====================== run optimization ============================
  configs_dict_single_rep = {
      "optimizer_llm_configs": optimizer_llm_dict,
      "data": {
          "num_points": num_points,
          "w_true": w_true,
          "b_true": b_true,
          "loss_at_true_values": loss_at_true_values,
          "X": list(X),
          "y": list(y),
      },
      "init_w": list(init_w),
      "init_b": list(init_b),
      "max_num_steps": max_num_steps,
      "max_num_pairs": max_num_pairs,
      "num_input_decimals": num_input_decimals,
      "num_output_decimals": num_output_decimals,
      "num_generated_points_in_each_step": num_generated_points_in_each_step,
  }
  configs_dict[i_rep] = configs_dict_single_rep
  configs_json_path = os.path.join(save_folder, "configs.json")
  print(f"saving configs to\n{configs_json_path}")
  with open(configs_json_path, "w") as f:
    json.dump(configs_dict, f, indent=4)

  old_value_pairs_set = set()
  old_value_pairs_with_i_step = []  # format: [(w, b, z = f(w, b), i_step)]
  meta_prompts_dict = dict()  # format: {i_step: meta_prompt}
  raw_outputs_dict = dict()  # format: {i_step: raw_outputs}

  rounded_inits = [
      (np.round(w, num_input_decimals), np.round(b, num_input_decimals))
      for w, b in zip(init_w, init_b)
  ]
  # np.unique(axis=0) drops starting points that round to the same pair.
  rounded_inits = [
      tuple(item) for item in list(np.unique(rounded_inits, axis=0))
  ]
  for w, b in rounded_inits:
    z = evaluate_loss(X, y, w, b)
    old_value_pairs_set.add((w, b, z))
    old_value_pairs_with_i_step.append((w, b, z, -1))  # -1 marks an init point

  print("\n================ run optimization ==============")
  print(
      f"initial points: {[tuple(item[:2]) for item in old_value_pairs_set]}"
  )
  print(f"initial values: {[item[-1] for item in old_value_pairs_set]}")
  results_json_path = os.path.join(save_folder, "results.json")
  print(f"saving results to\n{results_json_path}")

  for i_step in range(max_num_steps):
    print(f"\nStep {i_step}:")
    meta_prompt = gen_meta_prompt(
        old_value_pairs_set,
        X,
        y,
        num_input_decimals=num_input_decimals,
        num_output_decimals=num_output_decimals,
        max_num_pairs=max_num_pairs,
    )
    if not i_step % 5:
      print("\n=================================================")
      print(f"meta_prompt:\n{meta_prompt}")
    meta_prompts_dict[i_step] = meta_prompt

    # generate a maximum of the given number of points in each step
    remaining_num_points_to_generate = num_generated_points_in_each_step
    raw_outputs = []
    while remaining_num_points_to_generate > 0:
      raw_outputs += call_optimizer_server_func(meta_prompt)
      remaining_num_points_to_generate -= optimizer_llm_dict["batch_size"]
    raw_outputs = raw_outputs[:num_generated_points_in_each_step]

    raw_outputs_dict[i_step] = raw_outputs
    parsed_outputs = []
    for string in raw_outputs:
      if not i_step % 5:
        print("\n=================================================")
        print("raw output:\n", string)
        print("\n=================================================")
      try:
        parsed_output = parse_output(
            extract_string_in_square_brackets(string)
        )
      except ValueError:
        # Best effort: an output that cannot be parsed is ignored.
        continue
      # Non-finite proposals ("nan", "inf", "1e999") are dropped here;
      # otherwise the next gen_meta_prompt call fails on int(nan) / int(inf).
      if (
          parsed_output is not None
          and len(parsed_output) == 2
          and np.all(np.isfinite(parsed_output))
      ):
        parsed_outputs.append(parsed_output)
    parsed_outputs = [tuple(item) for item in parsed_outputs]
    print(f"proposed points before rounding: {parsed_outputs}")

    # round the proposed points to the number of decimals in meta-prompt
    rounded_outputs = [
        (np.round(w, num_input_decimals), np.round(b, num_input_decimals))
        for w, b in parsed_outputs
    ]
    rounded_outputs = [
        tuple(item) for item in list(np.unique(rounded_outputs, axis=0))
    ]
    print(f"proposed points after rounding: {rounded_outputs}")

    # evaluate the values of proposed and rounded outputs
    single_step_values = []
    for w, b in rounded_outputs:
      z = evaluate_loss(X, y, w, b)
      if not np.isfinite(z):
        # A loss that overflowed cannot be shown in the meta-prompt.
        continue
      if w == w_true and b == b_true:
        found_optimal = True
      single_step_values.append(z)
      old_value_pairs_set.add((w, b, z))
      old_value_pairs_with_i_step.append((w, b, z, i_step))
    print(f"single_step_values: {single_step_values}")

    # ====================== save results ============================
    results_dict_single_rep = {
        "meta_prompts": meta_prompts_dict,
        "raw_outputs": raw_outputs_dict,
        "old_value_pairs_with_i_step": old_value_pairs_with_i_step,
    }
    results_dict[i_rep] = results_dict_single_rep
    with open(results_json_path, "w") as f:
      json.dump(results_dict, f, indent=4)
    if found_optimal:
      print(
          f"Repetition {i_rep+1}, optimal found at Step {i_step+1}, saving"
          f" final results to\n{save_folder}"
      )
      num_convergence_steps.append(i_step + 1)
      break
  print(f"num_convergence_steps: {num_convergence_steps}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
app.run(main)
|
|
||||||
@@ -1,430 +0,0 @@
|
|||||||
# Copyright 2024 The OPRO Authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
r"""Optimize over the objective function of a traveling salesman problem.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
|
|
||||||
```
|
|
||||||
python optimize_tsp.py --optimizer="text-bison"
|
|
||||||
```
|
|
||||||
|
|
||||||
Note:
|
|
||||||
- When using a Google-Cloud-served model (like text-bison at
|
|
||||||
https://developers.generativeai.google/tutorials/text_quickstart), add
|
|
||||||
`--palm_api_key="<your_key>"`
|
|
||||||
- When using an OpenAI model, add `--openai_api_key="<your_key>"`
|
|
||||||
"""
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import functools
|
|
||||||
import getpass
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
OPRO_ROOT_PATH = os.path.dirname(
|
|
||||||
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
|
|
||||||
)
|
|
||||||
sys.path.insert(0, OPRO_ROOT_PATH)
|
|
||||||
|
|
||||||
from absl import app
|
|
||||||
from absl import flags
|
|
||||||
import google.generativeai as palm
|
|
||||||
import numpy as np
|
|
||||||
import openai
|
|
||||||
|
|
||||||
from opro import prompt_utils
|
|
||||||
|
|
||||||
|
|
||||||
_OPENAI_API_KEY = flags.DEFINE_string(
|
|
||||||
"openai_api_key", "", "The OpenAI API key."
|
|
||||||
)
|
|
||||||
|
|
||||||
_PALM_API_KEY = flags.DEFINE_string("palm_api_key", "", "The PaLM API key.")
|
|
||||||
|
|
||||||
_OPTIMIZER = flags.DEFINE_string(
|
|
||||||
"optimizer", "gpt-3.5-turbo", "The name of the optimizer LLM."
|
|
||||||
)
|
|
||||||
|
|
||||||
_START_ALGORITHM = flags.DEFINE_string(
|
|
||||||
"starting_algorithm", "farthest_insertion", "The name of the starting algorithm. Select from [dp, nearest_neighbor, farthest_insertion]"
|
|
||||||
)
|
|
||||||
|
|
||||||
def main(_):
|
|
||||||
# ============== set optimization experiment configurations ================
|
|
||||||
num_points = 100 # number of points in TSP
|
|
||||||
num_steps = 500 # the number of optimization steps
|
|
||||||
max_num_pairs = 10 # the maximum number of input-output pairs in meta-prompt
|
|
||||||
num_decimals = 0 # num of decimals for distances in meta-prompt
|
|
||||||
num_starting_points = 5 # the number of initial points for optimization
|
|
||||||
num_decode_per_step = 8 # the number of decoded solutions per step
|
|
||||||
|
|
||||||
# ================ load LLM settings ===================
|
|
||||||
optimizer_llm_name = _OPTIMIZER.value
|
|
||||||
assert optimizer_llm_name in {
|
|
||||||
"text-bison",
|
|
||||||
"gpt-3.5-turbo",
|
|
||||||
"gpt-4",
|
|
||||||
}
|
|
||||||
openai_api_key = _OPENAI_API_KEY.value
|
|
||||||
palm_api_key = _PALM_API_KEY.value
|
|
||||||
|
|
||||||
if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
|
|
||||||
assert openai_api_key, "The OpenAI API key must be provided."
|
|
||||||
openai.api_key = openai_api_key
|
|
||||||
else:
|
|
||||||
assert optimizer_llm_name == "text-bison"
|
|
||||||
assert (
|
|
||||||
palm_api_key
|
|
||||||
), "A PaLM API key is needed when prompting the text-bison model."
|
|
||||||
palm.configure(api_key=palm_api_key)
|
|
||||||
|
|
||||||
# =================== create the result directory ==========================
|
|
||||||
datetime_str = (
|
|
||||||
str(datetime.datetime.now().replace(microsecond=0))
|
|
||||||
.replace(" ", "-")
|
|
||||||
.replace(":", "-")
|
|
||||||
)
|
|
||||||
|
|
||||||
save_folder = os.path.join(
|
|
||||||
OPRO_ROOT_PATH,
|
|
||||||
"outputs",
|
|
||||||
"optimization-results",
|
|
||||||
f"tsp-o-{optimizer_llm_name}-{datetime_str}/",
|
|
||||||
)
|
|
||||||
os.makedirs(save_folder)
|
|
||||||
print(f"result directory:\n{save_folder}")
|
|
||||||
|
|
||||||
# ====================== optimizer model configs ============================
|
|
||||||
if optimizer_llm_name.lower() == "text-bison":
|
|
||||||
# when prompting text-bison with Cloud API
|
|
||||||
optimizer_finetuned_palm_temperature = 1.0
|
|
||||||
optimizer_finetuned_palm_max_decode_steps = 1024
|
|
||||||
optimizer_finetuned_palm_batch_size = 1
|
|
||||||
optimizer_finetuned_palm_num_servers = 1
|
|
||||||
optimizer_finetuned_palm_dict = dict()
|
|
||||||
optimizer_finetuned_palm_dict["temperature"] = (
|
|
||||||
optimizer_finetuned_palm_temperature
|
|
||||||
)
|
|
||||||
optimizer_finetuned_palm_dict["batch_size"] = (
|
|
||||||
optimizer_finetuned_palm_batch_size
|
|
||||||
)
|
|
||||||
optimizer_finetuned_palm_dict["num_servers"] = (
|
|
||||||
optimizer_finetuned_palm_num_servers
|
|
||||||
)
|
|
||||||
optimizer_finetuned_palm_dict["max_decode_steps"] = (
|
|
||||||
optimizer_finetuned_palm_max_decode_steps
|
|
||||||
)
|
|
||||||
|
|
||||||
call_optimizer_finetuned_palm_server_func = functools.partial(
|
|
||||||
prompt_utils.call_palm_server_from_cloud,
|
|
||||||
# prompt_utils.call_vllm,
|
|
||||||
model="text-bison-001",
|
|
||||||
temperature=optimizer_finetuned_palm_dict["temperature"],
|
|
||||||
max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
|
|
||||||
)
|
|
||||||
|
|
||||||
optimizer_llm_dict = {
|
|
||||||
"model_type": optimizer_llm_name.lower(),
|
|
||||||
}
|
|
||||||
optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
|
|
||||||
call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
|
|
||||||
|
|
||||||
else:
|
|
||||||
assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
|
|
||||||
optimizer_gpt_max_decode_steps = 1024
|
|
||||||
optimizer_gpt_temperature = 1.0
|
|
||||||
|
|
||||||
optimizer_llm_dict = dict()
|
|
||||||
optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
|
|
||||||
optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
|
|
||||||
optimizer_llm_dict["batch_size"] = 1
|
|
||||||
call_optimizer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_openai_server_func,
|
|
||||||
model=optimizer_llm_name,
|
|
||||||
max_decode_steps=optimizer_gpt_max_decode_steps,
|
|
||||||
temperature=optimizer_gpt_temperature,
|
|
||||||
)
|
|
||||||
|
|
||||||
# ====================== try calling the servers ============================
|
|
||||||
print("\n======== testing the optimizer server ===========")
|
|
||||||
optimizer_test_output = call_optimizer_server_func(
|
|
||||||
"Does the sun rise from the north? Just answer yes or no.",
|
|
||||||
temperature=1.0,
|
|
||||||
)
|
|
||||||
print(f"optimizer test output: {optimizer_test_output}")
|
|
||||||
print("Finished testing the optimizer server.")
|
|
||||||
print("\n=================================================")
|
|
||||||
|
|
||||||
# ====================== utility functions ============================
|
|
||||||
def evaluate_distance(x, y, trace, num_decimals):  # pylint: disable=invalid-name
  """Return the length of the closed tour that visits `trace` in order.

  The tour goes through consecutive entries of `trace` and then from the last
  entry back to the first one.

  Args:
    x: indexable x coordinates of the points.
    y: indexable y coordinates of the points.
    trace: sequence of point indices in visiting order.
    num_decimals (int): decimals kept in the result; when not positive the
      length is truncated with int().

  Returns:
    The tour length, or -1 when `trace` is empty or holds an entry that
    cannot index `x` and `y`.
  """
  num_stops = len(trace)
  if num_stops == 0:
    # Nothing to traverse; report it as an invalid trace instead of failing
    # on trace[-1].
    return -1
  dis = 0
  try:
    for i in range(num_stops):
      id0 = trace[i]
      # The modulo wraps the last stop back to the first, so the closing edge
      # is covered by the same error handling as every other edge.
      id1 = trace[(i + 1) % num_stops]
      dis += np.sqrt((x[id0] - x[id1]) ** 2 + (y[id0] - y[id1]) ** 2)
  except (LookupError, TypeError):
    # Out-of-range or non-integer entries mark the trace as invalid.
    return -1
  return np.round(dis, num_decimals) if num_decimals > 0 else int(dis)
|
|
||||||
|
|
||||||
def solve_tsp(x, y, num_points, num_decimals, starting_algorithm):
  """Build a reference tour over points 0..num_points-1 starting at point 0.

  Args:
    x: indexable x coordinates of the points.
    y: indexable y coordinates of the points.
    num_points (int): the number of points to visit.
    num_decimals (int): decimals kept in the returned length; when not
      positive the length is truncated with int().
    starting_algorithm (str): "nearest_neighbor" or "farthest_insertion"; any
      other value selects the exact dynamic program over visited-point sets.

  Returns:
    (gt_sol, min_dis): the visiting order as a list of point indices and the
    length of the closed tour.
  """
  import collections  # function scope: only the dynamic program needs it

  def _dist(a, b):
    # Euclidean distance between points a and b.
    return np.sqrt((x[a] - x[b]) ** 2 + (y[a] - y[b]) ** 2)

  def _rounded(length):
    # Apply the num_decimals convention shared by all three algorithms.
    return np.round(length, num_decimals) if num_decimals > 0 else int(length)

  if starting_algorithm == "nearest_neighbor":
    min_dis = 0
    gt_sol = [0]
    remaining_points = list(range(1, num_points))
    while remaining_points:
      current = gt_sol[-1]
      min_p = -1
      min_cur_dis = -1
      for p in remaining_points:
        cur_dis = _dist(p, current)
        # Strict "<" keeps the first of several equally close points.
        if min_p == -1 or cur_dis < min_cur_dis:
          min_p = p
          min_cur_dis = cur_dis
      gt_sol.append(min_p)
      min_dis += min_cur_dis
      remaining_points.remove(min_p)
    min_dis += _dist(0, gt_sol[-1])  # close the tour
    return gt_sol, _rounded(min_dis)

  if starting_algorithm == "farthest_insertion":
    gt_sol = [0]
    remaining_points = list(range(1, num_points))
    while remaining_points:
      max_p = -1
      max_cur_dis = -1
      max_cur_index = -1
      for p in remaining_points:
        # Cheapest position for p, measured by the resulting tour length.
        min_cur_dis = -1
        min_cur_index = -1
        for index in range(1, len(gt_sol) + 1):
          new_sol = gt_sol[:index] + [p] + gt_sol[index:]
          cur_dis = evaluate_distance(x, y, new_sol, num_decimals)
          if min_cur_dis == -1 or cur_dis < min_cur_dis:
            min_cur_dis = cur_dis
            min_cur_index = index
        # Insert the point whose cheapest insertion is the most expensive.
        if max_cur_dis == -1 or min_cur_dis > max_cur_dis:
          max_p = p
          max_cur_dis = min_cur_dis
          max_cur_index = min_cur_index
      gt_sol = gt_sol[:max_cur_index] + [max_p] + gt_sol[max_cur_index:]
      remaining_points.remove(max_p)
    min_dis = evaluate_distance(x, y, gt_sol, num_decimals)
    return gt_sol, min_dis

  # Exact dynamic program. A state is (last point, bitmask of visited points)
  # and maps to (shortest path length so far, that path).
  full_status = (1 << num_points) - 1
  best = {(0, 1): (0, [0])}
  # States are expanded breadth-first, so every state with k visited points is
  # final before any state with k + 1 points is expanded.
  queue = collections.deque([(0, 1)])
  min_dis = -1
  gt_sol = list(range(num_points))
  while queue:
    p, status = queue.popleft()
    base_dis, base_path = best[(p, status)]
    for i in range(num_points):
      bit = 1 << i
      if status & bit:
        continue  # point i is already on this path
      new_status = status | bit
      new_dis = base_dis + _dist(i, p)
      key = (i, new_status)
      # A state is queued exactly once, when it is first reached; this
      # replaces a linear "not in queue" scan.
      first_visit = key not in best
      if first_visit or new_dis < best[key][0]:
        best[key] = (new_dis, base_path + [i])
        if new_status == full_status:
          new_dis += _dist(i, 0)  # close the tour
          if min_dis == -1 or new_dis < min_dis:
            min_dis = new_dis
            gt_sol = best[key][1][:]
        elif first_visit:
          queue.append(key)
  if min_dis == -1 and num_points == 1:
    # A single point has no moves to expand; its tour has length 0, which is
    # what the other two algorithms return.
    min_dis = 0
  return gt_sol, _rounded(min_dis)
|
|
||||||
|
|
||||||
def gen_meta_prompt(
    old_value_pairs_set,
    x,  # pylint: disable=invalid-name
    y,
    max_num_pairs=100,
):
  """Build the meta-prompt that asks the optimizer for a shorter trace.

  Args:
    old_value_pairs_set (set): previously evaluated (trace, length) pairs.
    x: the x coordinates of the points, listed in the prompt by index.
    y: the y coordinates of the points, listed in the prompt by index.
    max_num_pairs (int): the maximum number of exemplars in the meta-prompt.

  Returns:
    meta_prompt (str): the generated meta-prompt.
  """
  # Sort by descending length and keep the tail, i.e. the shortest traces.
  ranked_pairs = sorted(old_value_pairs_set, key=lambda pair: -pair[1])
  shown_pairs = ranked_pairs[-max_num_pairs:]
  exemplar_text = "".join(
      f"\n<trace> {trace} </trace>\nlength:\n{dis}\n"
      for trace, dis in shown_pairs
  )

  coordinate_text = ", ".join(
      f"({i}): ({xi}, {yi})" for i, (xi, yi) in enumerate(zip(x, y))
  )
  intro = (
      "You are given a list of points with coordinates below:\n"
      + coordinate_text
      + ".\n\nBelow are some previous traces and their lengths. The traces"
      " are arranged in descending order based on their lengths, where lower"
      " values are better."
  )
  request = (
      "Give me a new trace that is different from all traces above, and has"
      " a length lower than any of the above. The trace should traverse all"
      " points exactly once. The trace should start with '<trace>' and end"
      " with </trace>."
  )
  return "\n\n".join([intro, exemplar_text.strip(), request])
|
|
||||||
|
|
||||||
def extract_string(input_string):
    """Parse the integers listed inside the first <trace>...</trace> span.

    Args:
      input_string (str): raw text that may contain a `<trace>` section whose
        content is a comma-separated list of integers.

    Returns:
      A list of the integers found between the first `<trace>` and the next
      `</trace>`; comma-separated items that are not valid integers are
      skipped. An empty string `""` is returned when either tag is missing
      (kept as-is for backward compatibility with existing callers).
    """
    start_string = "<trace>"
    end_string = "</trace>"

    _, start_found, remainder = input_string.partition(start_string)
    if not start_found:
        return ""
    body, end_found, _ = remainder.partition(end_string)
    if not end_found:
        return ""

    parsed_list = []
    for piece in body.split(","):
        try:
            parsed_list.append(int(piece.strip()))
        except ValueError:
            # Not an integer (e.g. stray words or empty items): ignore it.
            continue
    return parsed_list
|
|
||||||
|
|
||||||
# ================= generate the ground truth trace =====================
|
|
||||||
|
|
||||||
x = np.random.uniform(low=-100, high=100, size=num_points)
|
|
||||||
y = np.random.uniform(low=-100, high=100, size=num_points)
|
|
||||||
x = [np.round(xi, num_decimals) if num_decimals > 0 else int(xi) for xi in x]
|
|
||||||
y = [np.round(yi, num_decimals) if num_decimals > 0 else int(yi) for yi in y]
|
|
||||||
|
|
||||||
starting_algorithm = _START_ALGORITHM.value
|
|
||||||
|
|
||||||
gt_sol, min_dis = solve_tsp(x, y, num_points, num_decimals, starting_algorithm)
|
|
||||||
print("ground truth solution" + str(gt_sol))
|
|
||||||
print("min distance: ", min_dis)
|
|
||||||
gt_sol_str = ",".join([str(i) for i in gt_sol])
|
|
||||||
point_list = range(num_points)
|
|
||||||
init_sols = []
|
|
||||||
while len(init_sols) < num_starting_points:
|
|
||||||
sol = np.random.permutation(point_list)
|
|
||||||
if sol[0] != 0:
|
|
||||||
continue
|
|
||||||
sol_str = ",".join([str(i) for i in sol])
|
|
||||||
if sol_str == gt_sol_str:
|
|
||||||
continue
|
|
||||||
init_sols.append(list(sol))
|
|
||||||
|
|
||||||
# ====================== run optimization ============================
|
|
||||||
configs_dict = {
|
|
||||||
"num_starting_points": num_starting_points,
|
|
||||||
"num_decode_per_step": num_decode_per_step,
|
|
||||||
"optimizer_llm_configs": optimizer_llm_dict,
|
|
||||||
"data": {
|
|
||||||
"ground truth solution": [",".join([str(i) for i in gt_sol])],
|
|
||||||
"loss_at_true_values": min_dis,
|
|
||||||
"x": list(x),
|
|
||||||
"y": list(y),
|
|
||||||
},
|
|
||||||
"init_sols": [",".join([str(i) for i in sol]) for sol in init_sols],
|
|
||||||
"num_steps": num_steps,
|
|
||||||
"max_num_pairs": max_num_pairs,
|
|
||||||
"num_decimals": num_decimals,
|
|
||||||
}
|
|
||||||
configs_json_path = os.path.join(save_folder, "configs.json")
|
|
||||||
print(f"saving configs to\n{configs_json_path}")
|
|
||||||
with open(configs_json_path, "w") as f:
|
|
||||||
json.dump(configs_dict, f, indent=4)
|
|
||||||
|
|
||||||
old_value_pairs_set = set()
|
|
||||||
old_value_pairs_with_i_step = [] # format: [(trace, dis = f(trace), i_step)]
|
|
||||||
meta_prompts_dict = dict() # format: {i_step: meta_prompt}
|
|
||||||
raw_outputs_dict = dict() # format: {i_step: raw_outputs}
|
|
||||||
|
|
||||||
for sol in init_sols:
|
|
||||||
dis = evaluate_distance(x, y, sol, num_decimals)
|
|
||||||
sol_str = ",".join([str(i) for i in sol])
|
|
||||||
old_value_pairs_set.add((sol_str, dis))
|
|
||||||
old_value_pairs_with_i_step.append((sol_str, dis, -1))
|
|
||||||
|
|
||||||
print("\n================ run optimization ==============")
|
|
||||||
print(f"initial points: {[tuple(item[:-1]) for item in old_value_pairs_set]}")
|
|
||||||
print(f"initial values: {[item[-1] for item in old_value_pairs_set]}")
|
|
||||||
results_json_path = os.path.join(save_folder, "results.json")
|
|
||||||
print(f"saving results to\n{results_json_path}")
|
|
||||||
|
|
||||||
for i_step in range(num_steps):
|
|
||||||
print(f"\nStep {i_step}:")
|
|
||||||
meta_prompt = gen_meta_prompt(
|
|
||||||
old_value_pairs_set,
|
|
||||||
x,
|
|
||||||
y,
|
|
||||||
max_num_pairs=max_num_pairs,
|
|
||||||
)
|
|
||||||
print("\n=================================================")
|
|
||||||
print(f"meta_prompt:\n{meta_prompt}")
|
|
||||||
meta_prompts_dict[i_step] = meta_prompt
|
|
||||||
raw_outputs = []
|
|
||||||
parsed_outputs = []
|
|
||||||
while len(parsed_outputs) < num_decode_per_step:
|
|
||||||
raw_output = call_optimizer_server_func(meta_prompt)
|
|
||||||
for string in raw_output:
|
|
||||||
print("\n=================================================")
|
|
||||||
print("raw output:\n", string)
|
|
||||||
try:
|
|
||||||
parsed_output = extract_string(string)
|
|
||||||
if parsed_output is not None and len(set(parsed_output)) == num_points and len(parsed_output) == num_points and parsed_output[0] == 0:
|
|
||||||
dis = evaluate_distance(x, y, parsed_output, num_decimals)
|
|
||||||
if dis == -1:
|
|
||||||
continue
|
|
||||||
parsed_outputs.append(parsed_output)
|
|
||||||
raw_outputs.append(string)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
print("\n=================================================")
|
|
||||||
print(f"proposed points: {parsed_outputs}")
|
|
||||||
raw_outputs_dict[i_step] = raw_outputs
|
|
||||||
|
|
||||||
# evaluate the values of proposed and rounded outputs
|
|
||||||
single_step_values = []
|
|
||||||
for trace in parsed_outputs:
|
|
||||||
dis = evaluate_distance(x, y, trace, num_decimals)
|
|
||||||
single_step_values.append(dis)
|
|
||||||
trace_str = ",".join([str(i) for i in trace])
|
|
||||||
old_value_pairs_set.add((trace_str, dis))
|
|
||||||
old_value_pairs_with_i_step.append((trace_str, dis, i_step))
|
|
||||||
print(f"single_step_values: {single_step_values}")
|
|
||||||
print("ground truth solution" + str(gt_sol))
|
|
||||||
print("min distance: ", min_dis)
|
|
||||||
|
|
||||||
# ====================== save results ============================
|
|
||||||
results_dict = {
|
|
||||||
"meta_prompts": meta_prompts_dict,
|
|
||||||
"raw_outputs": raw_outputs_dict,
|
|
||||||
"old_value_pairs_with_i_step": old_value_pairs_with_i_step,
|
|
||||||
}
|
|
||||||
with open(results_json_path, "w") as f:
|
|
||||||
json.dump(results_dict, f, indent=4)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: when executed directly (not imported), pass `main` to
# `app.run` (presumably absl's `app` module - confirm against this file's imports).
if __name__ == "__main__":
|
|
||||||
app.run(main)
|
|
||||||
@@ -1,967 +0,0 @@
|
|||||||
# Copyright 2023 The OPRO Authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
r"""The .py file for prompt optimization.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
|
|
||||||
Step 1: edit the starting instructions by modifying `initial_instructions`
|
|
||||||
|
|
||||||
Step 2: edit the training ratio by modifying `train_ratio`
|
|
||||||
|
|
||||||
Step 3: check if the model configs (like batch size) are the same as the actual serving configs
|
|
||||||
|
|
||||||
Step 4: run
|
|
||||||
|
|
||||||
```
|
|
||||||
python optimize_instructions.py \
|
|
||||||
--optimizer="gpt-3.5-turbo" --scorer="text-bison" \
|
|
||||||
--instruction_pos="A_begin" --dataset="gsm8k" --task="train"
|
|
||||||
```
|
|
||||||
|
|
||||||
The outputs will then be written to `outputs/optimization-results/` in the opro folder.
|
|
||||||
|
|
||||||
Notes:
|
|
||||||
|
|
||||||
1. One or more API keys may need to be provided:
|
|
||||||
- When using a Google-Cloud-served model (like text-bison at https://developers.generativeai.google/tutorials/text_quickstart), add `--palm_api_key=<your_key>`
|
|
||||||
- When using an OpenAI model, add `--openai_api_key="<your_key>"`
|
|
||||||
|
|
||||||
2. The initial instructions should be provided in the "initial_instructions"
|
|
||||||
variable.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import functools
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
OPRO_ROOT_PATH = os.path.dirname(
|
|
||||||
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
|
|
||||||
)
|
|
||||||
sys.path.insert(0, OPRO_ROOT_PATH)
|
|
||||||
|
|
||||||
from absl import app
|
|
||||||
from absl import flags
|
|
||||||
import google.generativeai as palm
|
|
||||||
import numpy as np
|
|
||||||
import openai
|
|
||||||
from opro import prompt_utils
|
|
||||||
from opro.optimization import opt_utils
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Folder that holds the benchmark datasets, located under the OPRO root.
ROOT_DATA_FOLDER_PATH = os.path.join(OPRO_ROOT_PATH, "data")

# ---------------------------- command-line flags ----------------------------
# Each flag holder is read via `.value` once the flags have been parsed.

_LOCAL_MODEL_PATH = flags.DEFINE_string(
    name="local_model_path",
    default=None,
    help="Path to local vLLM model.",
)

_OPENAI_API_KEY = flags.DEFINE_string(
    name="openai_api_key",
    default="",
    help="The OpenAI API key.",
)

_PALM_API_KEY = flags.DEFINE_string(
    name="palm_api_key",
    default="",
    help="The PaLM API key.",
)

_SCORER = flags.DEFINE_string(
    name="scorer",
    default="text-bison",
    help="The name of the scorer LLM.",
)

_OPTIMIZER = flags.DEFINE_string(
    name="optimizer",
    default="gpt-3.5-turbo",
    help="The name of the optimizer LLM.",
)

_DATASET = flags.DEFINE_string(
    name="dataset",
    default="gsm8k",
    help="The name of dataset to search for instructions on.",
)

_TASK = flags.DEFINE_string(
    name="task",
    default="train",
    help=(
        "The name of task within the above dataset to search for"
        " instructions on."
    ),
)

_INSTRUCTION_POS = flags.DEFINE_string(
    name="instruction_pos",
    default="A_begin",
    help="The position of the instruction to search for.",
)

_META_PROMPT_TYPE = flags.DEFINE_string(
    name="meta_prompt_type",
    default="both_instructions_and_exemplars",
    help=(
        "The type of meta-prompt: whether to have both previous instructions"
        " and dataset exemplars (often for fine-tuned optimizers), or to have"
        " only previous instructions (often for pre-trained optimizers)."
    ),
)
|
|
||||||
|
|
||||||
|
|
||||||
def main(_):
|
|
||||||
openai_api_key = _OPENAI_API_KEY.value
|
|
||||||
palm_api_key = _PALM_API_KEY.value
|
|
||||||
scorer_llm_name = _SCORER.value
|
|
||||||
optimizer_llm_name = _OPTIMIZER.value
|
|
||||||
dataset_name = _DATASET.value.lower()
|
|
||||||
task_name = _TASK.value
|
|
||||||
meta_prompt_type = _META_PROMPT_TYPE.value
|
|
||||||
local_model_path = _LOCAL_MODEL_PATH.value
|
|
||||||
|
|
||||||
assert dataset_name in {
|
|
||||||
"mmlu",
|
|
||||||
"bbh",
|
|
||||||
"gsm8k",
|
|
||||||
}, "The lower-case dataset name must be one of mmlu, bbh, or gsm8k."
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
assert task_name in {
|
|
||||||
"STEM",
|
|
||||||
"humanities",
|
|
||||||
"social sciences",
|
|
||||||
"otheran (business, health, misc.)",
|
|
||||||
} # for now only support searching on one MMLU category
|
|
||||||
elif dataset_name == "bbh":
|
|
||||||
assert task_name in {
|
|
||||||
"boolean_expressions",
|
|
||||||
"causal_judgement",
|
|
||||||
"date_understanding",
|
|
||||||
"disambiguation_qa",
|
|
||||||
"dyck_languages",
|
|
||||||
"formal_fallacies",
|
|
||||||
"geometric_shapes",
|
|
||||||
"hyperbaton",
|
|
||||||
"logical_deduction_five_objects",
|
|
||||||
"logical_deduction_seven_objects",
|
|
||||||
"logical_deduction_three_objects",
|
|
||||||
"movie_recommendation",
|
|
||||||
"multistep_arithmetic_two",
|
|
||||||
"navigate",
|
|
||||||
"object_counting",
|
|
||||||
"penguins_in_a_table",
|
|
||||||
"reasoning_about_colored_objects",
|
|
||||||
"ruin_names",
|
|
||||||
"salient_translation_error_detection",
|
|
||||||
"snarks",
|
|
||||||
"sports_understanding",
|
|
||||||
"temporal_sequences",
|
|
||||||
"tracking_shuffled_objects_five_objects",
|
|
||||||
"tracking_shuffled_objects_seven_objects",
|
|
||||||
"tracking_shuffled_objects_three_objects",
|
|
||||||
"web_of_lies",
|
|
||||||
"word_sorting",
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
assert dataset_name == "gsm8k"
|
|
||||||
assert task_name in {"train", "test"}
|
|
||||||
|
|
||||||
assert scorer_llm_name in {
|
|
||||||
"text-bison",
|
|
||||||
"gpt-3.5-turbo",
|
|
||||||
"gpt-4",
|
|
||||||
"local",
|
|
||||||
}
|
|
||||||
assert optimizer_llm_name in {
|
|
||||||
"text-bison",
|
|
||||||
"gpt-3.5-turbo",
|
|
||||||
"gpt-4",
|
|
||||||
"local",
|
|
||||||
}
|
|
||||||
assert meta_prompt_type in {
|
|
||||||
"both_instructions_and_exemplars",
|
|
||||||
"instructions_only",
|
|
||||||
}
|
|
||||||
|
|
||||||
instruction_pos = _INSTRUCTION_POS.value
|
|
||||||
assert instruction_pos in {
|
|
||||||
"before_Q",
|
|
||||||
"Q_begin",
|
|
||||||
"Q_end",
|
|
||||||
"A_begin",
|
|
||||||
}, (
|
|
||||||
"The instruction position should be either before the question, or at the"
|
|
||||||
" beginning of the question, at the end of the question, or at the"
|
|
||||||
" beginning of the answer."
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
f"scorer: {scorer_llm_name}, optimizer: {optimizer_llm_name}, dataset:"
|
|
||||||
f" {dataset_name}, task: {task_name}, instruction_pos: {instruction_pos}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# make sure the scorer and optimizer models are callable
|
|
||||||
|
|
||||||
if scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
|
|
||||||
assert openai_api_key, "The OpenAI API key must be provided."
|
|
||||||
openai.api_key = openai_api_key
|
|
||||||
elif scorer_llm_name == "text-bison":
|
|
||||||
assert scorer_llm_name == "text-bison"
|
|
||||||
assert (
|
|
||||||
palm_api_key
|
|
||||||
), "A PaLM API key is needed when prompting the text-bison model."
|
|
||||||
palm.configure(api_key=palm_api_key)
|
|
||||||
|
|
||||||
elif scorer_llm_name == "local":
|
|
||||||
assert local_model_path, "The local model path must be provided."
|
|
||||||
assert os.path.exists(local_model_path), (
|
|
||||||
f"The local model path {local_model_path} does not exist."
|
|
||||||
)
|
|
||||||
# set the local model path for vLLM
|
|
||||||
# prompt_utils.call_local_server_func(local_model_path)
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"Unknown scorer_llm_name: {scorer_llm_name}. "
|
|
||||||
"It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
|
|
||||||
)
|
|
||||||
|
|
||||||
if optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
|
|
||||||
assert openai_api_key, "The OpenAI API key must be provided."
|
|
||||||
openai.api_key = openai_api_key
|
|
||||||
elif optimizer_llm_name == "text-bison":
|
|
||||||
assert optimizer_llm_name == "text-bison"
|
|
||||||
assert (
|
|
||||||
palm_api_key
|
|
||||||
), "A PaLM API key is needed when prompting the text-bison model."
|
|
||||||
palm.configure(api_key=palm_api_key)
|
|
||||||
|
|
||||||
elif optimizer_llm_name == "local":
|
|
||||||
assert local_model_path, "The local model path must be provided."
|
|
||||||
assert os.path.exists(local_model_path), (
|
|
||||||
f"The local model path {local_model_path} does not exist."
|
|
||||||
)
|
|
||||||
# set the local model path for vLLM
|
|
||||||
# prompt_utils.call_local_server_func(local_model_path)
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"Unknown scorer_llm_name: {optimizer_llm_name}. "
|
|
||||||
"It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "MMLU-data")
|
|
||||||
elif dataset_name == "bbh":
|
|
||||||
root_data_folder_path = os.path.join(
|
|
||||||
ROOT_DATA_FOLDER_PATH, "BIG-Bench-Hard-data/"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert dataset_name == "gsm8k"
|
|
||||||
root_data_folder_path = os.path.join(ROOT_DATA_FOLDER_PATH, "gsm_data")
|
|
||||||
|
|
||||||
# =================== create the result directory ==========================
|
|
||||||
datetime_str = (
|
|
||||||
str(datetime.datetime.now().replace(microsecond=0))
|
|
||||||
.replace(" ", "-")
|
|
||||||
.replace(":", "-")
|
|
||||||
)
|
|
||||||
|
|
||||||
save_folder = os.path.join(
|
|
||||||
OPRO_ROOT_PATH,
|
|
||||||
"outputs",
|
|
||||||
"optimization-results",
|
|
||||||
f"{dataset_name.upper()}-{task_name}-s-{scorer_llm_name}-o-{optimizer_llm_name}-{datetime_str}/",
|
|
||||||
)
|
|
||||||
result_by_instruction_folder = os.path.join(
|
|
||||||
save_folder, "result_by_instruction"
|
|
||||||
)
|
|
||||||
os.makedirs(result_by_instruction_folder)
|
|
||||||
print(f"result directory:\n{save_folder}")
|
|
||||||
|
|
||||||
# ====================== scorer model configs ==============================
|
|
||||||
# difference between num_decodes and batch_size:
|
|
||||||
# - num_decodes: how many outputs we actually want for each input
|
|
||||||
# - batch_size: the batch size in model serving, should equal to that in
|
|
||||||
# model serving config
|
|
||||||
# Constant definitions
|
|
||||||
DEFAULT_MAX_TOKENS = 1024
|
|
||||||
DEFAULT_TEMPERATURE = 0.0
|
|
||||||
PALM_MODEL_NAME = "text-bison-001"
|
|
||||||
|
|
||||||
if scorer_llm_name == "text-bison":
|
|
||||||
config = {
|
|
||||||
"temperature": DEFAULT_TEMPERATURE,
|
|
||||||
"max_decode_steps": DEFAULT_MAX_TOKENS,
|
|
||||||
"batch_size": 1,
|
|
||||||
"num_servers": 1,
|
|
||||||
}
|
|
||||||
call_scorer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_palm_server_from_cloud,
|
|
||||||
model=PALM_MODEL_NAME,
|
|
||||||
**config
|
|
||||||
)
|
|
||||||
scorer_llm_dict = {"model_type": "text-bison", **config}
|
|
||||||
|
|
||||||
elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
|
|
||||||
config = {
|
|
||||||
"temperature": DEFAULT_TEMPERATURE,
|
|
||||||
"max_decode_steps": DEFAULT_MAX_TOKENS,
|
|
||||||
"batch_size": 1,
|
|
||||||
"num_servers": 1,
|
|
||||||
}
|
|
||||||
call_scorer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_openai_server_func,
|
|
||||||
model=scorer_llm_name.lower(),
|
|
||||||
**config
|
|
||||||
)
|
|
||||||
scorer_llm_dict = {"model_type": scorer_llm_name.lower(), **config}
|
|
||||||
|
|
||||||
elif scorer_llm_name == "local":
|
|
||||||
print(f"[DEBUG] local_model_path: {local_model_path}")
|
|
||||||
assert local_model_path, "Local model path must be provided."
|
|
||||||
config = {
|
|
||||||
"temperature": DEFAULT_TEMPERATURE,
|
|
||||||
"max_decode_steps": DEFAULT_MAX_TOKENS,
|
|
||||||
"batch_size": 8,
|
|
||||||
"num_servers": 8,# number of servers to use for local model
|
|
||||||
}
|
|
||||||
call_scorer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_local_server_func,
|
|
||||||
local_model_path=local_model_path,
|
|
||||||
**config
|
|
||||||
)
|
|
||||||
scorer_llm_dict = {"model_type": "local", **config}
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unsupported model: {scorer_llm_name}")
|
|
||||||
|
|
||||||
# if scorer_llm_name == "text-bison":
|
|
||||||
# # when prompting text-bison with Cloud API
|
|
||||||
# scorer_finetuned_palm_temperature = 0.0
|
|
||||||
# scorer_finetuned_palm_max_decode_steps = 1024
|
|
||||||
# scorer_finetuned_palm_batch_size = 1
|
|
||||||
# scorer_finetuned_palm_num_servers = 1
|
|
||||||
# scorer_finetuned_palm_dict = dict()
|
|
||||||
# scorer_finetuned_palm_dict["temperature"] = (
|
|
||||||
# scorer_finetuned_palm_temperature
|
|
||||||
# )
|
|
||||||
# scorer_finetuned_palm_dict["num_servers"] = (
|
|
||||||
# scorer_finetuned_palm_num_servers
|
|
||||||
# )
|
|
||||||
# scorer_finetuned_palm_dict["batch_size"] = scorer_finetuned_palm_batch_size
|
|
||||||
# scorer_finetuned_palm_dict["max_decode_steps"] = (
|
|
||||||
# scorer_finetuned_palm_max_decode_steps
|
|
||||||
# )
|
|
||||||
|
|
||||||
# call_scorer_finetuned_palm_server_func = functools.partial(
|
|
||||||
# prompt_utils.call_palm_server_from_cloud,
|
|
||||||
# model="text-bison-001",
|
|
||||||
# temperature=scorer_finetuned_palm_dict["temperature"],
|
|
||||||
# max_decode_steps=scorer_finetuned_palm_dict["max_decode_steps"],
|
|
||||||
# )
|
|
||||||
|
|
||||||
# scorer_llm_dict = {
|
|
||||||
# "model_type": scorer_llm_name.lower(),
|
|
||||||
# }
|
|
||||||
# scorer_llm_dict.update(scorer_finetuned_palm_dict)
|
|
||||||
# call_scorer_server_func = call_scorer_finetuned_palm_server_func
|
|
||||||
|
|
||||||
# elif scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}:
|
|
||||||
# # assert scorer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}
|
|
||||||
# scorer_gpt_max_decode_steps = 1024
|
|
||||||
# scorer_gpt_temperature = 0.0
|
|
||||||
|
|
||||||
# scorer_gpt_dict = dict()
|
|
||||||
# scorer_gpt_dict["max_decode_steps"] = scorer_gpt_max_decode_steps
|
|
||||||
# scorer_gpt_dict["temperature"] = scorer_gpt_temperature
|
|
||||||
# scorer_gpt_dict["num_decodes"] = 1
|
|
||||||
# scorer_gpt_dict["batch_size"] = 1
|
|
||||||
# scorer_gpt_dict["num_servers"] = 1
|
|
||||||
|
|
||||||
# scorer_llm_dict = {
|
|
||||||
# "model_type": scorer_llm_name.lower(),
|
|
||||||
# }
|
|
||||||
# scorer_llm_dict.update(scorer_gpt_dict)
|
|
||||||
# call_scorer_server_func = functools.partial(
|
|
||||||
# prompt_utils.call_openai_server_func,
|
|
||||||
# model=scorer_llm_name.lower(),
|
|
||||||
# max_decode_steps=scorer_gpt_max_decode_steps,
|
|
||||||
# temperature=scorer_gpt_temperature,
|
|
||||||
# )
|
|
||||||
# elif scorer_llm_name == "local":
|
|
||||||
# # local vLLM model
|
|
||||||
# scorer_local_max_decode_steps = 1024
|
|
||||||
# scorer_local_temperature = 0.0
|
|
||||||
# call_scorer_server_func = functools.partial(
|
|
||||||
# prompt_utils.call_local_model_server_func,
|
|
||||||
# model_path=local_model_path,
|
|
||||||
# max_decode_steps=scorer_local_max_decode_steps,
|
|
||||||
# temperature=scorer_local_temperature,
|
|
||||||
# )
|
|
||||||
# else:
|
|
||||||
# raise ValueError(
|
|
||||||
# f"Unknown scorer_llm_name: {scorer_llm_name}. "
|
|
||||||
# "It should be one of text-bison, gpt-3.5-turbo, gpt-4, or local."
|
|
||||||
# )
|
|
||||||
|
|
||||||
# ====================== optimizer model configs ============================
|
|
||||||
if optimizer_llm_name.lower() == "text-bison":
|
|
||||||
# PaLM text-bison model configuration
|
|
||||||
optimizer_llm_dict = {
|
|
||||||
"model_type": "text-bison",
|
|
||||||
"temperature": 1.0, # 更高的随机性以生成多样化解
|
|
||||||
"max_decode_steps": 1024, # 最大生成长度
|
|
||||||
"batch_size": 1, # 单样本处理
|
|
||||||
"num_decodes": 8, # 生成8个候选结果
|
|
||||||
"num_servers": 1 # 单服务器
|
|
||||||
}
|
|
||||||
|
|
||||||
call_optimizer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_palm_server_from_cloud,
|
|
||||||
model="text-bison-001",
|
|
||||||
temperature=optimizer_llm_dict["temperature"],
|
|
||||||
max_decode_steps=optimizer_llm_dict["max_decode_steps"],
|
|
||||||
)
|
|
||||||
|
|
||||||
elif optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4"}:
|
|
||||||
# GPT model configuration
|
|
||||||
optimizer_llm_dict = {
|
|
||||||
"model_type": optimizer_llm_name.lower(),
|
|
||||||
"temperature": 1.0, # 更高的随机性
|
|
||||||
"max_decode_steps": 512, # 较短的最大长度
|
|
||||||
"batch_size": 1,
|
|
||||||
"num_decodes": 1 , # 单次生成
|
|
||||||
"num_servers": 1 # 单服务器
|
|
||||||
}
|
|
||||||
|
|
||||||
call_optimizer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_openai_server_func,
|
|
||||||
model=optimizer_llm_name,
|
|
||||||
max_decode_steps=optimizer_llm_dict["max_decode_steps"],
|
|
||||||
temperature=optimizer_llm_dict["temperature"],
|
|
||||||
)
|
|
||||||
elif optimizer_llm_name.lower() == "local":
|
|
||||||
assert local_model_path, "Local model path must be provided."
|
|
||||||
optimizer_llm_dict = {
|
|
||||||
"model_type": optimizer_llm_name.lower(),
|
|
||||||
"temperature": 1.0, # 更高的随机性
|
|
||||||
"max_decode_steps": 512, # 较短的最大长度
|
|
||||||
"batch_size": 8,
|
|
||||||
"num_decodes": 1 , # 单次生成
|
|
||||||
"num_servers": 8 # 单服务器
|
|
||||||
}
|
|
||||||
call_optimizer_server_func = functools.partial(
|
|
||||||
prompt_utils.call_local_server_func,
|
|
||||||
local_model_path=local_model_path,
|
|
||||||
max_decode_steps=optimizer_llm_dict["max_decode_steps"],
|
|
||||||
temperature=optimizer_llm_dict["temperature"],
|
|
||||||
)
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"Unsupported optimizer model: {optimizer_llm_name}. "
|
|
||||||
"Must be one of: text-bison, gpt-3.5-turbo, gpt-4"
|
|
||||||
)
|
|
||||||
|
|
||||||
# if optimizer_llm_name.lower() == "text-bison":
|
|
||||||
# # when prompting text-bison with Cloud API
|
|
||||||
# optimizer_finetuned_palm_temperature = 1.0
|
|
||||||
# optimizer_finetuned_palm_num_decodes = 8
|
|
||||||
# optimizer_finetuned_palm_max_decode_steps = 1024
|
|
||||||
# optimizer_finetuned_palm_batch_size = 1
|
|
||||||
# optimizer_finetuned_palm_num_servers = 1
|
|
||||||
# optimizer_finetuned_palm_dict = dict()
|
|
||||||
# optimizer_finetuned_palm_dict["temperature"] = (
|
|
||||||
# optimizer_finetuned_palm_temperature
|
|
||||||
# )
|
|
||||||
# optimizer_finetuned_palm_dict["num_decodes"] = (
|
|
||||||
# optimizer_finetuned_palm_num_decodes
|
|
||||||
# )
|
|
||||||
# optimizer_finetuned_palm_dict["batch_size"] = (
|
|
||||||
# optimizer_finetuned_palm_batch_size
|
|
||||||
# )
|
|
||||||
# optimizer_finetuned_palm_dict["num_servers"] = (
|
|
||||||
# optimizer_finetuned_palm_num_servers
|
|
||||||
# )
|
|
||||||
# optimizer_finetuned_palm_dict["max_decode_steps"] = (
|
|
||||||
# optimizer_finetuned_palm_max_decode_steps
|
|
||||||
# )
|
|
||||||
|
|
||||||
# call_optimizer_finetuned_palm_server_func = functools.partial(
|
|
||||||
# prompt_utils.call_palm_server_from_cloud,
|
|
||||||
# model="text-bison-001",
|
|
||||||
# temperature=optimizer_finetuned_palm_dict["temperature"],
|
|
||||||
# max_decode_steps=optimizer_finetuned_palm_dict["max_decode_steps"],
|
|
||||||
# )
|
|
||||||
|
|
||||||
# optimizer_llm_dict = {
|
|
||||||
# "model_type": optimizer_llm_name.lower(),
|
|
||||||
# }
|
|
||||||
# optimizer_llm_dict.update(optimizer_finetuned_palm_dict)
|
|
||||||
# call_optimizer_server_func = call_optimizer_finetuned_palm_server_func
|
|
||||||
|
|
||||||
# else:
|
|
||||||
# assert optimizer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
|
|
||||||
# optimizer_gpt_max_decode_steps = 512
|
|
||||||
# optimizer_gpt_temperature = 1.0
|
|
||||||
|
|
||||||
# optimizer_llm_dict = dict()
|
|
||||||
# optimizer_llm_dict["max_decode_steps"] = optimizer_gpt_max_decode_steps
|
|
||||||
# optimizer_llm_dict["temperature"] = optimizer_gpt_temperature
|
|
||||||
# optimizer_llm_dict["batch_size"] = 1
|
|
||||||
# optimizer_llm_dict["num_decodes"] = 1
|
|
||||||
# call_optimizer_server_func = functools.partial(
|
|
||||||
# prompt_utils.call_openai_server_func,
|
|
||||||
# model=optimizer_llm_name,
|
|
||||||
# max_decode_steps=optimizer_gpt_max_decode_steps,
|
|
||||||
# temperature=optimizer_gpt_temperature,
|
|
||||||
# )
|
|
||||||
|
|
||||||
# ====================== try calling the servers ============================
|
|
||||||
print("\n======== testing the scorer and optimizer servers ===========")
|
|
||||||
scorer_test_output = call_scorer_server_func(
|
|
||||||
"Does the sun rise from the north? Just answer yes or no."
|
|
||||||
)
|
|
||||||
print(f"number of scorer output decodes: {len(scorer_test_output)}")
|
|
||||||
print(f"scorer test output: {scorer_test_output}")
|
|
||||||
optimizer_test_output = call_optimizer_server_func(
|
|
||||||
"Does the sun rise from the north? Just answer yes or no.",
|
|
||||||
temperature=1.0,
|
|
||||||
)
|
|
||||||
print(f"number of optimizer output decodes: {len(optimizer_test_output)}")
|
|
||||||
print(f"optimizer test output: {optimizer_test_output}")
|
|
||||||
print("Finished testing the servers.")
|
|
||||||
|
|
||||||
# ====================== read data ============================
|
|
||||||
print("\n================ prompt optimization settings ==============")
|
|
||||||
# from https://github.com/hendrycks/test/blob/master/categories.py
|
|
||||||
subcategories = {
|
|
||||||
"abstract_algebra": ["math"],
|
|
||||||
"anatomy": ["health"],
|
|
||||||
"astronomy": ["physics"],
|
|
||||||
"business_ethics": ["business"],
|
|
||||||
"clinical_knowledge": ["health"],
|
|
||||||
"college_biology": ["biology"],
|
|
||||||
"college_chemistry": ["chemistry"],
|
|
||||||
"college_computer_science": ["computer science"],
|
|
||||||
"college_mathematics": ["math"],
|
|
||||||
"college_medicine": ["health"],
|
|
||||||
"college_physics": ["physics"],
|
|
||||||
"computer_security": ["computer science"],
|
|
||||||
"conceptual_physics": ["physics"],
|
|
||||||
"econometrics": ["economics"],
|
|
||||||
"electrical_engineering": ["engineering"],
|
|
||||||
"elementary_mathematics": ["math"],
|
|
||||||
"formal_logic": ["philosophy"],
|
|
||||||
"global_facts": ["other"],
|
|
||||||
"high_school_biology": ["biology"],
|
|
||||||
"high_school_chemistry": ["chemistry"],
|
|
||||||
"high_school_computer_science": ["computer science"],
|
|
||||||
"high_school_european_history": ["history"],
|
|
||||||
"high_school_geography": ["geography"],
|
|
||||||
"high_school_government_and_politics": ["politics"],
|
|
||||||
"high_school_macroeconomics": ["economics"],
|
|
||||||
"high_school_mathematics": ["math"],
|
|
||||||
"high_school_microeconomics": ["economics"],
|
|
||||||
"high_school_physics": ["physics"],
|
|
||||||
"high_school_psychology": ["psychology"],
|
|
||||||
"high_school_statistics": ["math"],
|
|
||||||
"high_school_us_history": ["history"],
|
|
||||||
"high_school_world_history": ["history"],
|
|
||||||
"human_aging": ["health"],
|
|
||||||
"human_sexuality": ["culture"],
|
|
||||||
"international_law": ["law"],
|
|
||||||
"jurisprudence": ["law"],
|
|
||||||
"logical_fallacies": ["philosophy"],
|
|
||||||
"machine_learning": ["computer science"],
|
|
||||||
"management": ["business"],
|
|
||||||
"marketing": ["business"],
|
|
||||||
"medical_genetics": ["health"],
|
|
||||||
"miscellaneous": ["other"],
|
|
||||||
"moral_disputes": ["philosophy"],
|
|
||||||
"moral_scenarios": ["philosophy"],
|
|
||||||
"nutrition": ["health"],
|
|
||||||
"philosophy": ["philosophy"],
|
|
||||||
"prehistory": ["history"],
|
|
||||||
"professional_accounting": ["other"],
|
|
||||||
"professional_law": ["law"],
|
|
||||||
"professional_medicine": ["health"],
|
|
||||||
"professional_psychology": ["psychology"],
|
|
||||||
"public_relations": ["politics"],
|
|
||||||
"security_studies": ["politics"],
|
|
||||||
"sociology": ["culture"],
|
|
||||||
"us_foreign_policy": ["politics"],
|
|
||||||
"virology": ["health"],
|
|
||||||
"world_religions": ["philosophy"],
|
|
||||||
}
|
|
||||||
|
|
||||||
categories = {
|
|
||||||
"STEM": [
|
|
||||||
"physics",
|
|
||||||
"chemistry",
|
|
||||||
"biology",
|
|
||||||
"computer science",
|
|
||||||
"math",
|
|
||||||
"engineering",
|
|
||||||
],
|
|
||||||
"humanities": ["history", "philosophy", "law"],
|
|
||||||
"social sciences": [
|
|
||||||
"politics",
|
|
||||||
"culture",
|
|
||||||
"economics",
|
|
||||||
"geography",
|
|
||||||
"psychology",
|
|
||||||
],
|
|
||||||
"other (business, health, misc.)": ["other", "business", "health"],
|
|
||||||
}
|
|
||||||
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
# EITHER: filter by category
|
|
||||||
# category_names = [
|
|
||||||
# "STEM",
|
|
||||||
# "humanities",
|
|
||||||
# "social sciences",
|
|
||||||
# "other (business, health, misc.)",
|
|
||||||
# ]
|
|
||||||
category_names = [task_name]
|
|
||||||
folder_name = "test" # one of {'auxiliary_train', 'dev', 'val', 'test'}
|
|
||||||
task_names = []
|
|
||||||
for task_csv_name in os.listdir(
|
|
||||||
os.path.join(root_data_folder_path, folder_name)
|
|
||||||
):
|
|
||||||
task_names.append(task_csv_name.split(".")[0])
|
|
||||||
|
|
||||||
tasks_in_category = []
|
|
||||||
for category_name in category_names:
|
|
||||||
for task_name in task_names:
|
|
||||||
for subname in subcategories:
|
|
||||||
if subname in task_name:
|
|
||||||
if subcategories[subname][0] in categories[category_name]:
|
|
||||||
tasks_in_category.append(task_name)
|
|
||||||
break
|
|
||||||
|
|
||||||
tasks_all = [(folder_name, task_name) for task_name in tasks_in_category]
|
|
||||||
multiple_choice_tasks = set([item[1] for item in tasks_all])
|
|
||||||
boolean_tasks = set()
|
|
||||||
numerical_output_tasks = set()
|
|
||||||
|
|
||||||
# OR: filter by task
|
|
||||||
# tasks_all = [
|
|
||||||
# # ('test', 'abstract_algebra_test'),
|
|
||||||
# # ('test', 'college_computer_science_test'),
|
|
||||||
# # ('test', 'college_mathematics_test'),
|
|
||||||
# # ('test', 'college_physics_test'),
|
|
||||||
# # ('test', 'elementary_mathematics_test'),
|
|
||||||
# # ('test', 'global_facts_test'),
|
|
||||||
# # ('test', 'high_school_physics_test'),
|
|
||||||
# # ('test', 'machine_learning_test'),
|
|
||||||
# # ('test', 'management_test'),
|
|
||||||
# # ('test', 'medical_genetics_test'),
|
|
||||||
# # ('test', 'moral_scenarios_test'),
|
|
||||||
# # ('test', 'professional_psychology_test'),
|
|
||||||
# # ('test', 'public_relations_test'),
|
|
||||||
# # ('test', 'professional_law_test'),
|
|
||||||
# # ('test', 'high_school_psychology_test'),
|
|
||||||
# # ('test', 'high_school_world_history_test'),
|
|
||||||
# # ('test', 'human_aging_test'),
|
|
||||||
# # ('test', 'miscellaneous_test'),
|
|
||||||
# # ('test', 'moral_scenarios_test'),
|
|
||||||
# ('test', 'professional_psychology_test'),
|
|
||||||
# # ('test', 'security_studies_test'),
|
|
||||||
# ]
|
|
||||||
|
|
||||||
elif dataset_name == "bbh":
|
|
||||||
tasks_all = [task_name]
|
|
||||||
assert (
|
|
||||||
len(tasks_all) == 1
|
|
||||||
), "for now only support prompt optimization on one BBH task"
|
|
||||||
|
|
||||||
# all BBH tasks are as below
|
|
||||||
# tasks_all = [
|
|
||||||
# 'boolean_expressions',
|
|
||||||
# 'causal_judgement',
|
|
||||||
# 'date_understanding',
|
|
||||||
# 'disambiguation_qa',
|
|
||||||
# 'dyck_languages',
|
|
||||||
# 'formal_fallacies',
|
|
||||||
# 'geometric_shapes',
|
|
||||||
# 'hyperbaton',
|
|
||||||
# 'logical_deduction_five_objects',
|
|
||||||
# 'logical_deduction_seven_objects',
|
|
||||||
# 'logical_deduction_three_objects',
|
|
||||||
# 'movie_recommendation',
|
|
||||||
# 'multistep_arithmetic_two',
|
|
||||||
# 'navigate',
|
|
||||||
# 'object_counting',
|
|
||||||
# 'penguins_in_a_table',
|
|
||||||
# 'reasoning_about_colored_objects',
|
|
||||||
# 'ruin_names',
|
|
||||||
# 'salient_translation_error_detection',
|
|
||||||
# 'snarks',
|
|
||||||
# 'sports_understanding',
|
|
||||||
# 'temporal_sequences',
|
|
||||||
# 'tracking_shuffled_objects_five_objects',
|
|
||||||
# 'tracking_shuffled_objects_seven_objects',
|
|
||||||
# 'tracking_shuffled_objects_three_objects',
|
|
||||||
# 'web_of_lies',
|
|
||||||
# 'word_sorting'
|
|
||||||
# ]
|
|
||||||
numerical_output_tasks = {
|
|
||||||
"object_counting",
|
|
||||||
"multistep_arithmetic_two",
|
|
||||||
}
|
|
||||||
|
|
||||||
multiple_choice_tasks = {
|
|
||||||
"date_understanding",
|
|
||||||
"disambiguation_qa",
|
|
||||||
"geometric_shapes",
|
|
||||||
"hyperbaton",
|
|
||||||
"logical_deduction_five_objects",
|
|
||||||
"logical_deduction_seven_objects",
|
|
||||||
"logical_deduction_three_objects",
|
|
||||||
"movie_recommendation",
|
|
||||||
"penguins_in_a_table",
|
|
||||||
"reasoning_about_colored_objects",
|
|
||||||
"ruin_names",
|
|
||||||
"salient_translation_error_detection",
|
|
||||||
"snarks",
|
|
||||||
"temporal_sequences",
|
|
||||||
"tracking_shuffled_objects_five_objects",
|
|
||||||
"tracking_shuffled_objects_seven_objects",
|
|
||||||
"tracking_shuffled_objects_three_objects",
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean_tasks = {
|
|
||||||
"boolean_expressions", # True or False
|
|
||||||
"causal_judgement", # yes or no
|
|
||||||
"formal_fallacies", # valid or invalid
|
|
||||||
"navigate", # yes or no
|
|
||||||
"sports_understanding", # yes or no
|
|
||||||
"web_of_lies", # yes or no
|
|
||||||
}
|
|
||||||
|
|
||||||
else:
|
|
||||||
assert dataset_name in {"gsm8k"}
|
|
||||||
tasks_all = [task_name]
|
|
||||||
multiple_choice_tasks = set()
|
|
||||||
boolean_tasks = set()
|
|
||||||
numerical_output_tasks = set(tasks_all)
|
|
||||||
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
raw_data = pd.DataFrame()
|
|
||||||
prediction_treat_as_number = False
|
|
||||||
prediction_treat_as_bool = False
|
|
||||||
elif dataset_name == "bbh":
|
|
||||||
raw_data = []
|
|
||||||
prediction_treat_as_number = bool(
|
|
||||||
tasks_all[0] in numerical_output_tasks
|
|
||||||
) # for now only check the first task
|
|
||||||
prediction_treat_as_bool = bool(
|
|
||||||
tasks_all[0] in boolean_tasks
|
|
||||||
) # for now only check the first task
|
|
||||||
print(
|
|
||||||
f"prediction_treat_as_number: {prediction_treat_as_number},"
|
|
||||||
f" prediction_treat_as_bool: {prediction_treat_as_bool}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert dataset_name == "gsm8k"
|
|
||||||
raw_data = pd.DataFrame()
|
|
||||||
prediction_treat_as_number = True
|
|
||||||
prediction_treat_as_bool = False
|
|
||||||
|
|
||||||
for t in tasks_all:
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
folder_name = t[0]
|
|
||||||
task_name = t[1]
|
|
||||||
single_task_df = pd.read_csv(
|
|
||||||
os.path.join(root_data_folder_path, f"{folder_name}/{task_name}.csv"),
|
|
||||||
index_col=None,
|
|
||||||
header=None,
|
|
||||||
)
|
|
||||||
raw_data = pd.concat([raw_data, single_task_df])
|
|
||||||
elif dataset_name == "bbh":
|
|
||||||
task_name = t
|
|
||||||
single_task_list = opt_utils.load_bbh_task_data(
|
|
||||||
task_name, base_dir=root_data_folder_path
|
|
||||||
)
|
|
||||||
raw_data += single_task_list
|
|
||||||
else:
|
|
||||||
assert dataset_name == "gsm8k"
|
|
||||||
task_name = t
|
|
||||||
f_gsm = os.path.join(root_data_folder_path, f"gsm_{task_name}.tsv")
|
|
||||||
single_task_df = pd.read_csv(f_gsm, sep="\t", header=None)
|
|
||||||
raw_data = pd.concat([raw_data, single_task_df])
|
|
||||||
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
num_examples = raw_data.shape[0]
|
|
||||||
elif dataset_name == "bbh":
|
|
||||||
num_examples = len(raw_data)
|
|
||||||
else:
|
|
||||||
assert dataset_name in {"gsm8k"}
|
|
||||||
num_examples = raw_data.shape[0]
|
|
||||||
print(f"number of examples in the current task: {num_examples}")
|
|
||||||
|
|
||||||
# ================ split data into train/val/test ==========================
|
|
||||||
if dataset_name == "mmlu":
|
|
||||||
train_ratio = 0.8
|
|
||||||
eval_ratio = 0.2
|
|
||||||
elif dataset_name == "gsm8k":
|
|
||||||
train_ratio = 0.035
|
|
||||||
eval_ratio = 0
|
|
||||||
else:
|
|
||||||
assert dataset_name == "bbh"
|
|
||||||
train_ratio = 0.2
|
|
||||||
eval_ratio = 0
|
|
||||||
|
|
||||||
# train-validation-test split
|
|
||||||
# It is important to sort the indices, as this ensures the is_multiple_choice
|
|
||||||
# Boolean variables match the data points.
|
|
||||||
assert train_ratio + eval_ratio <= 1
|
|
||||||
test_ratio = 1 - train_ratio - eval_ratio
|
|
||||||
print(
|
|
||||||
f"train_ratio: {train_ratio}, eval_ratio: {eval_ratio}, "
|
|
||||||
f"test_ratio: {test_ratio}"
|
|
||||||
)
|
|
||||||
np.random.seed(0)
|
|
||||||
train_index = np.sort(
|
|
||||||
np.array(
|
|
||||||
np.random.choice(
|
|
||||||
num_examples, size=int(train_ratio * num_examples), replace=False
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
eval_and_test_index = np.sort(
|
|
||||||
np.array(list(set(np.arange(num_examples)) - set(train_index)))
|
|
||||||
)
|
|
||||||
eval_index = np.sort(
|
|
||||||
np.array(
|
|
||||||
np.random.choice(
|
|
||||||
eval_and_test_index,
|
|
||||||
size=int(eval_ratio * num_examples),
|
|
||||||
replace=False,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# ========== set other optimization experiment hyperparameters ==============
|
|
||||||
if scorer_llm_name == "text-bison":
|
|
||||||
old_instruction_score_threshold = 0.0 # 完全保留旧指令 表示不过滤任何历史指令(即使质量很低的旧指令也会保留)。
|
|
||||||
# old_instruction_score_threshold = 0.15 # for GSM8K
|
|
||||||
elif scorer_llm_name == "local":
|
|
||||||
old_instruction_score_threshold = 0.3
|
|
||||||
else:
|
|
||||||
assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"} # 模型校验
|
|
||||||
old_instruction_score_threshold = 0.3 # 过滤低质量旧指令
|
|
||||||
|
|
||||||
if scorer_llm_name == "text-bison":
|
|
||||||
extract_final_answer_by_prompting_again = False # 是否通过二次提示提取最终答案(例如从冗长响应中提取关键内容)
|
|
||||||
include_qa = False # 是否在元提示中包含问答对
|
|
||||||
evaluate_in_parallel = False # 是否并行评估
|
|
||||||
elif scorer_llm_name == "local":
|
|
||||||
extract_final_answer_by_prompting_again = True
|
|
||||||
include_qa = True
|
|
||||||
evaluate_in_parallel = True
|
|
||||||
else:
|
|
||||||
assert scorer_llm_name in {"gpt-3.5-turbo", "gpt-4"}
|
|
||||||
extract_final_answer_by_prompting_again = False
|
|
||||||
include_qa = False
|
|
||||||
evaluate_in_parallel = False
|
|
||||||
|
|
||||||
optimizer_llm_temperature = optimizer_llm_dict["temperature"]
|
|
||||||
|
|
||||||
num_few_shot_questions_for_instruction_refinement = 3 # number of few-shot questions 每次优化指令时参考的少样本示例数量(Few-shot QA对)。
|
|
||||||
|
|
||||||
# To change the number of generated instructions in each step, one should
|
|
||||||
# edit the value of the variable below, instead of editing the number of
|
|
||||||
# decodes in model parameters, because those values are limited by model
|
|
||||||
# serving configs.
|
|
||||||
num_generated_instructions_in_each_step = 3 # number of generated instructions in each step 每轮搜索生成的候选指令数量。
|
|
||||||
num_search_steps = 50 # number of search steps 总优化迭代次数。
|
|
||||||
|
|
||||||
initial_instructions = [
|
|
||||||
"Let's solve the problem.",
|
|
||||||
# "",
|
|
||||||
# "The answer is",
|
|
||||||
]
|
|
||||||
few_shot_qa_pairs = True #是否使用少样本示例指导指令生成。
|
|
||||||
# one of {'accumulative_most_frequent', 'current_most_frequent', 'random',
|
|
||||||
# 'constant'}
|
|
||||||
few_shot_selection_criteria = "random" #对多样性要求高时用 random,稳定性要求高时用 most_frequent。
|
|
||||||
# whether to evaluate generated instructions on the exemplars in meta-prompt
|
|
||||||
evaluate_generated_ins_on_few_shot = False # 是否评估新指令 开发阶段设为 True调试指令质量。
|
|
||||||
# whether to evaluate old instructions on the exemplars in the meta-prompt
|
|
||||||
evaluate_old_ins_on_few_shot = False # 是否评估旧指令 生产阶段设为 False加速运行。
|
|
||||||
# every this number of steps, compute the accuracies of current-step
|
|
||||||
# instructions on the validation set
|
|
||||||
eval_interval = 3 # 每N步在验证集上测试当前指令的准确率。
|
|
||||||
|
|
||||||
max_num_instructions = (
|
|
||||||
20 # 元提示中保留的历史指令数量上限。
|
|
||||||
)
|
|
||||||
# 将连续分数离散化为N档(如0-100整数),简化模型理解。
|
|
||||||
num_score_buckets = 100
|
|
||||||
# whether to put old instructions and scores before the exemplars in the meta-prompt
|
|
||||||
# 控制元提示中历史指令和少样本示例的顺序。
|
|
||||||
meta_prompt_instructions_before_exemplars = True
|
|
||||||
|
|
||||||
# ===================== run prompt optimization ======================
|
|
||||||
|
|
||||||
assert few_shot_selection_criteria in {
|
|
||||||
"accumulative_most_frequent",
|
|
||||||
"current_most_frequent",
|
|
||||||
"random",
|
|
||||||
"constant",
|
|
||||||
}
|
|
||||||
evolution_kwargs = {
|
|
||||||
"num_search_steps": num_search_steps,
|
|
||||||
"old_instruction_score_threshold": old_instruction_score_threshold,
|
|
||||||
"scorer_llm_dict": scorer_llm_dict,
|
|
||||||
"optimizer_llm_dict": optimizer_llm_dict,
|
|
||||||
"extract_final_answer_by_prompting_again": (
|
|
||||||
extract_final_answer_by_prompting_again
|
|
||||||
),
|
|
||||||
"include_qa": include_qa,
|
|
||||||
"evaluate_in_parallel": evaluate_in_parallel,
|
|
||||||
"tasks_all": tasks_all,
|
|
||||||
"train_ratio": train_ratio,
|
|
||||||
"eval_ratio": eval_ratio,
|
|
||||||
"test_ratio": test_ratio,
|
|
||||||
"train_index": train_index,
|
|
||||||
"eval_index": eval_index,
|
|
||||||
"dataset_name": dataset_name,
|
|
||||||
"task_name": task_name,
|
|
||||||
"num_examples": num_examples,
|
|
||||||
"root_data_folder_path": root_data_folder_path,
|
|
||||||
"optimizer_llm_temperature": optimizer_llm_temperature,
|
|
||||||
# "optimizer_llm_temperature_schedule": (
|
|
||||||
# optimizer_llm_temperature_schedule
|
|
||||||
# ),
|
|
||||||
# "optimizer_llm_temperature_end": optimizer_llm_temperature_end,
|
|
||||||
"initial_instructions": initial_instructions,
|
|
||||||
"multiple_choice_tasks": multiple_choice_tasks,
|
|
||||||
"raw_data": raw_data,
|
|
||||||
"call_scorer_server_func": call_scorer_server_func,
|
|
||||||
"call_optimizer_server_func": call_optimizer_server_func,
|
|
||||||
"instruction_pos": instruction_pos,
|
|
||||||
"prediction_treat_as_number": prediction_treat_as_number,
|
|
||||||
"prediction_treat_as_bool": prediction_treat_as_bool,
|
|
||||||
"result_by_instruction_folder": result_by_instruction_folder,
|
|
||||||
"few_shot_qa_pairs": few_shot_qa_pairs,
|
|
||||||
"num_score_buckets": num_score_buckets,
|
|
||||||
"max_num_instructions": max_num_instructions,
|
|
||||||
"meta_prompt_type": meta_prompt_type,
|
|
||||||
"meta_prompt_instructions_before_exemplars": (
|
|
||||||
meta_prompt_instructions_before_exemplars
|
|
||||||
),
|
|
||||||
"few_shot_selection_criteria": few_shot_selection_criteria,
|
|
||||||
"optimizer_llm_name": optimizer_llm_name,
|
|
||||||
"num_generated_instructions_in_each_step": (
|
|
||||||
num_generated_instructions_in_each_step
|
|
||||||
),
|
|
||||||
"evaluate_generated_ins_on_few_shot": evaluate_generated_ins_on_few_shot,
|
|
||||||
"num_few_shot_questions_for_instruction_refinement": (
|
|
||||||
num_few_shot_questions_for_instruction_refinement
|
|
||||||
),
|
|
||||||
"evaluate_old_ins_on_few_shot": evaluate_old_ins_on_few_shot,
|
|
||||||
"eval_interval": eval_interval,
|
|
||||||
"save_folder": save_folder,
|
|
||||||
}
|
|
||||||
|
|
||||||
opt_utils.run_evolution(**evolution_kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: when this file is executed directly (rather than
# imported), hand `main` to absl's `app.run`.
if __name__ == "__main__":
  app.run(main)
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
{"session_id": "06e84e4b03c54df188961f4e033f9e77", "round": 1, "choice": "哪些奶茶含有较少的糖分?", "answer": "选择低糖奶茶时,可以通过品牌选择、糖分选项调整以及配料搭配来控制摄入的糖分。以下是一些具体建议和推荐:\n\n---\n\n### **一、低糖奶茶品牌及选项**\n1. **喜茶(HEYTEA)** \n - **推荐系列**:轻乳茶(如「轻乳茉莉奶绿」)、部分限定款(如「芝士奶盖茶」可选低糖)。 \n - **糖分控制**:默认糖度为50%,可选「三分糖」(约10%糖度)或「无糖」(使用代糖)。 \n - **注意**:奶盖本身含糖,建议选择「无奶盖」选项。\n\n2. **奈雪的茶** \n - **推荐系列**:鲜果茶(如「霸气橙子」)、「鲜果轻乳茶」系列。 \n - **糖分控制**:默认糖度为50%,可选「少糖」(约30%)或「无糖」(使用代糖)。 \n - **注意**:鲜果茶的天然果糖含量较高,需结合糖分选项综合判断。\n\n3. **CoCo都可** \n - **推荐系列**:「鲜果茶」(如「百香果鲜果茶」)、「轻乳茶」。 \n - **糖分控制**:默认全糖(约100%),可选「三分糖」(约30%)或「无糖」(代糖)。 \n - **注意**:部分经典奶茶(如「珍珠奶茶」)糖分较高,建议选择鲜果类饮品。\n\n4. **一点点** \n - **推荐系列**:「鲜果茶」(如「百香果鲜果茶」)、「轻乳茶」。 \n - **糖分控制**:默认全糖(约100%),可选「半糖」(50%)或「无糖」(代糖)。 \n - **注意**:部分奶盖茶含糖量较高,建议选择鲜果或轻乳系列。\n\n---\n\n### **二、如何选择低糖奶茶?**\n1. **糖分选项优先级** \n - **无糖/代糖**:使用赤藓糖醇、甜菊糖等代糖,几乎不含热量(但需注意部分人对代糖敏感)。 \n - **三分糖/半糖**:糖分约为全糖的30%-50%,适合控制摄入。 \n - **避免全糖**:全糖奶茶的糖分通常在"}
|
|
||||||
{"session_id": "af370656137a4b8f97d8ffac2c60b191", "round": 1, "choice": "瑞幸咖啡有哪些值得推荐的饮品?", "answer": "瑞幸咖啡近年来凭借高性价比和创新口味,推出了许多受欢迎的饮品。以下是一些值得推荐的饮品,涵盖经典款、特色新品和季节限定,适合不同口味和需求:\n\n---\n\n### **一、爆款必试系列**\n1. **生椰拿铁** \n - **推荐理由**:瑞幸的“现象级”产品,椰香浓郁与咖啡的苦香完美融合,口感顺滑,常年占据销量榜首。 \n - **小贴士**:可尝试“生椰拿铁+巧克力糖浆”组合,变成“生椰摩卡”,风味更丰富。\n\n2. **丝绒拿铁** \n - **推荐理由**:以红茶为基底,搭配丝滑牛奶,茶香与奶香平衡,适合喜欢茶咖融合的人群。 \n - **特色**:选用锡兰红茶,口感更醇厚,冷热皆宜。\n\n3. **厚乳拿铁** \n - **推荐理由**:使用厚乳(高乳脂含量的牛奶),奶香更浓郁,适合追求绵密口感的爱好者。\n\n---\n\n### **二、果味与创意系列**\n1. **冰椰拿铁**(夏季限定) \n - **推荐理由**:生椰拿铁的冰饮版本,加入冰块和椰香糖浆,清爽解暑,适合夏天。\n\n2. **蓝莓生椰拿铁** \n - **推荐理由**:在生椰拿铁基础上加入蓝莓糖浆,果香与椰香交织,甜而不腻。\n\n3. **蜜桃生椰拿铁** \n - **推荐理由**:蜜桃风味糖浆与生椰拿1:1搭配,清新果香与咖啡的苦香碰撞,适合喜欢果味的人。\n\n---\n\n### **三、季节限定款**\n1. **桂花拿铁**(秋季限定) \n - **推荐理由**:桂花糖浆与拿铁结合,香气扑鼻,甜度适中,是"}
|
|
||||||
Reference in New Issue
Block a user