-
Notifications
You must be signed in to change notification settings - Fork 73
Expand file tree
/
Copy pathgenerate.py
More file actions
241 lines (205 loc) · 8.51 KB
/
Copy pathgenerate.py
File metadata and controls
241 lines (205 loc) · 8.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import os
import json
import argparse
from typing import Optional, Tuple
from bigcodebench.provider import DecoderBase, make_model
from bigcodebench.data import get_bigcodebench, write_jsonl
from bigcodebench.sanitize import sanitize
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TextColumn,
TimeElapsedColumn,
)
def codegen(
    model: DecoderBase,
    target_path: str,
    split: str,
    subset: str,
    greedy: bool = False,
    strip_newlines: bool = False,
    n_samples: int = 1,
    id_range: Optional[Tuple[int, int]] = None,
    resume: bool = True,
    batch_size: int = -1,
):
    """Generate solutions for BigCodeBench tasks and append them to a JSONL file.

    Tasks are visited in dataset order, their prompts are collected into
    batches, each batch is sent to ``model.codegen`` and every returned
    completion is written to ``target_path`` as a record with the keys
    ``task_id``, ``solution`` (sanitized) and ``raw_solution``.

    Args:
        model: Decoder used for generation. It must provide
            ``is_direct_completion()`` and
            ``codegen(prompts, do_sample=..., num_samples=...)``.
        target_path: JSONL file the samples are appended to. Its parent
            directory is created when missing.
        split: Prompt flavour; the prompt is read from
            ``task[f"{split}_prompt"]``.
        subset: Dataset subset, forwarded to ``get_bigcodebench``.
        greedy: When true the model is called with ``do_sample=False``.
        strip_newlines: Strip leading and trailing newlines from each prompt.
        n_samples: Number of samples wanted per task.
        id_range: Optional half-open ``(low, high)`` range of task positions
            (enumeration indices, not task ids) to process.
        resume: When true, samples already present in ``target_path`` are
            counted per task and only the missing ones are generated.
        batch_size: Number of prompts per ``model.codegen`` call. With a
            falsy or never-matching value (``None``, ``0``, ``-1``) all
            prompts are sent as one batch after the last task.

    Raises:
        Exception: If a direct-completion model is used with the
            ``"instruct"`` split, or a task has no prompt for ``split``.
        AssertionError: If the model returns no outputs for a batch.
    """
    with Progress(
        TextColumn(
            f"BigCodeBench--{split.capitalize()} ({subset.capitalize()}) •"
            + "[progress.percentage]{task.percentage:>3.0f}%"
        ),
        BarColumn(),
        MofNCompleteColumn(),
        TextColumn("•"),
        TimeElapsedColumn(),
    ) as p:
        dataset = get_bigcodebench(subset=subset)

        if model.is_direct_completion() and split == "instruct":
            raise Exception("Base model does not support direct completion for instruct tasks")

        # Create the parent directory of target_path if needed, e.g. "a" for
        # a/b.jsonl. exist_ok avoids failing when it appears concurrently.
        dirname = os.path.dirname(target_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        batch_prompts = []
        batch_task_ids = []
        batch_nsamples = []
        batch_entry_points = []

        # Read existing data once if resuming: task_id -> number of samples
        # already stored in target_path.
        task2nexist = {}
        if resume and os.path.exists(target_path):
            with open(target_path, "r") as f:
                for line in f:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    item = json.loads(line)
                    task2nexist[item["task_id"]] = task2nexist.get(item["task_id"], 0) + 1

        last_task_index = len(dataset) - 1
        for id_num, (task_id, task) in enumerate(p.track(dataset.items())):
            if id_range is not None:
                low, high = id_range
                if id_num < low:
                    p.console.print(f"Skipping {task_id} as it is not in {id_range}")
                    continue
                if id_num >= high:
                    break

            p_name = task_id.replace("/", "_")

            n_existing = task2nexist.get(task_id, 0)
            nsamples = n_samples - n_existing

            try:
                prompt = task[f"{split}_prompt"]
            except KeyError as err:
                raise Exception(f"Invalid split {split} for bigcodebench-{subset}") from err
            if strip_newlines:
                prompt = prompt.strip("\n")

            if nsamples > 0:
                batch_prompts.append(prompt)
                batch_task_ids.append(task_id)
                batch_nsamples.append(nsamples)
                batch_entry_points.append(task["entry_point"])

                log = f"Codegen: {p_name} @ {model}"
                if n_existing > 0:
                    log += f" (resuming from {n_existing})"
                p.console.print(log)

            # Flush when the batch is full or this is the last task to visit.
            batch_full = bool(batch_size) and len(batch_prompts) == batch_size
            at_end = id_num == last_task_index or (
                id_range is not None and id_num == id_range[1] - 1
            )
            if not (batch_full or at_end):
                continue
            if not batch_prompts:
                # An empty batch can only be seen at the end: nothing is left
                # to generate.
                break

            # Every prompt in the batch is sampled as often as the neediest
            # task requires; surplus completions are dropped per task below.
            outputs = model.codegen(
                batch_prompts,
                do_sample=not greedy,
                num_samples=max(batch_nsamples),
            )
            assert outputs, "No outputs from model!"

            direct_completion = model.is_direct_completion()
            samples = []
            for b_task_id, content, entry_point, wanted, task_outputs in zip(
                batch_task_ids, batch_prompts, batch_entry_points, batch_nsamples, outputs
            ):
                for completion in task_outputs[:wanted]:
                    # Direct-completion models continue the prompt, so the
                    # prompt is prepended to obtain the full solution text.
                    raw = content + completion if direct_completion else completion
                    samples.append(
                        dict(
                            task_id=b_task_id,
                            solution=sanitize(raw, entry_point),
                            raw_solution=raw,
                        )
                    )

            print(f"Generated {len(samples)} samples")
            write_jsonl(target_path, samples, append=True)

            # Clear batches. All four lists must be reset together, otherwise
            # zip() above would pair new tasks with stale entry points.
            batch_prompts = []
            batch_task_ids = []
            batch_nsamples = []
            batch_entry_points = []
def run_codegen(
    model: str,
    split: str,
    subset: str,
    root: str = "bcb_results",
    lora_path: Optional[str] = None,
    bs: Optional[int] = None,
    n_samples: int = 1,
    temperature: float = 0.0,
    max_new_tokens: int = 1280,
    # vllm
    max_model_len: int = 12800,
    greedy: bool = False,
    # openai
    reasoning_effort: str = "medium",
    # anthropic
    reasoning_budget: int = 0,
    reasoning_beta: str = "output-128k-2025-02-19",
    strip_newlines: bool = False,
    direct_completion: bool = False,
    resume: bool = True,
    id_range: Optional[str] = None,
    backend: str = "vllm",
    base_url: Optional[str] = None,
    tp: int = 1,
    instruction_prefix: str = "Please provide a self-contained Python script that solves the following problem in a markdown code block:",
    response_prefix: str = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:",
    skip_prefill: bool = False,
    revision: str = "main",
    trust_remote_code: bool = False,
    tokenizer_name: Optional[str] = None,
    tokenizer_legacy: bool = False,
):
    """Build a model runner, generate samples with it and return the output path.

    The result file lives in ``root`` and its name encodes the model (with
    reasoning / LoRA suffixes where applicable), revision, subset, split,
    backend, temperature and sample count.

    Args:
        model: Model name; also used to derive the output file name.
        split: Prompt split, forwarded to ``make_model`` and ``codegen``.
        subset: Dataset subset; anything but ``"full"`` is added to the
            file name.
        root: Directory that receives the result file (created if missing).
        bs: Batch size handed to ``codegen`` as ``batch_size``.
        n_samples: Samples per task; forced to 1 under greedy decoding.
        temperature: Sampling temperature; forced to 0 under greedy decoding.
        greedy: Request greedy decoding. It is also switched on
            automatically when ``temperature == 0`` and ``n_samples == 1``.
        strip_newlines: Forwarded to ``codegen``.
        resume: When false, an existing result file is deleted first so
            generation starts from scratch.
        id_range: Optional ``"low-high"`` string selecting task positions.
        skip_prefill: Passes ``prefill=False`` to ``make_model`` and tags
            the file name with ``--skip_prefill``.
        lora_path, max_new_tokens, max_model_len, reasoning_effort,
        reasoning_budget, reasoning_beta, direct_completion, backend,
        base_url, tp, instruction_prefix, response_prefix, revision,
        trust_remote_code, tokenizer_name, tokenizer_legacy: Forwarded
            unchanged to ``make_model``; some also shape the file name.

    Returns:
        Path of the JSONL file the samples were written to.

    Raises:
        AssertionError: If ``id_range`` does not consist of exactly two
            increasing integers.
        ValueError: If a part of ``id_range`` is not an integer.
    """
    if greedy or (temperature == 0 and n_samples == 1):
        temperature = 0
        n_samples = 1
        greedy = True
        print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0")

    if id_range is not None:
        id_range = [int(i) for i in id_range.split("-")]
        assert len(id_range) == 2, "id_range must be a list of length 2"
        assert id_range[0] < id_range[1], "id_range must be increasing"
        id_range = tuple(id_range)

    # Make project dir
    os.makedirs(root, exist_ok=True)

    model_runner = make_model(
        model=model,
        backend=backend,
        subset=subset,
        split=split,
        lora_path=lora_path,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        max_model_len=max_model_len,
        reasoning_effort=reasoning_effort,
        reasoning_budget=reasoning_budget,
        reasoning_beta=reasoning_beta,
        instruction_prefix=instruction_prefix,
        response_prefix=response_prefix,
        prefill=not skip_prefill,
        base_url=base_url,
        tp=tp,
        revision=revision,
        trust_remote_code=trust_remote_code,
        direct_completion=direct_completion,
        tokenizer_name=tokenizer_name,
        tokenizer_legacy=tokenizer_legacy,
    )

    # From here on ``model`` is only used to build the result file name.
    extra = "-" + subset if subset != "full" else ""
    if backend == "openai" and reasoning_effort and any(
        model.startswith(m) or model.endswith(m)
        for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]
    ):
        model = model + f"--{reasoning_effort}"

    if lora_path:
        model = model + f"--lora-{lora_path}"

    if backend == "anthropic" and reasoning_budget and reasoning_beta:
        model = model + f"--{reasoning_budget}-{reasoning_beta}"

    prefill_tag = "--skip_prefill" if skip_prefill else ""
    identifier = (
        model.replace("/", "--")
        + prefill_tag
        + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
    )

    target_path = os.path.join(root, identifier)

    if not resume:
        # Start from scratch. A missing file (e.g. on the very first run) is
        # not an error, so only that specific failure is ignored.
        try:
            os.remove(target_path)
        except FileNotFoundError:
            pass

    codegen(
        model=model_runner,
        target_path=target_path,
        split=split,
        subset=subset,
        greedy=greedy,
        strip_newlines=strip_newlines,
        n_samples=n_samples,
        resume=resume,
        id_range=id_range,
        batch_size=bs,
    )

    return target_path
def main():
    """Expose ``run_codegen`` as a command-line interface via python-fire."""
    # fire is imported only when main() actually runs.
    import fire

    fire.Fire(run_codegen)


# Run the command-line interface when the file is executed as a script.
if __name__ == "__main__":
    main()