Подготовка тренировочных данных
import json, pandas as pd
from openai import OpenAI
# Shared OpenAI API client, constructed with no explicit arguments; the
# upload, fine-tuning, polling and inference steps below all go through it.
client = OpenAI()
# Source recipes loaded from CSV; the `title`, `ingredients` and `NER`
# columns are the ones read when chat examples are built.
recipe_df = pd.read_csv("data/cookbook_recipes_nlg_10k.csv")
# System prompt placed in every training example and reused at inference.
system_msg = "You are a helpful recipe assistant. Extract generic ingredients."
def prepare_example(row, *, system_message=None):
    """Build one chat-format fine-tuning example from a recipe record.

    Args:
        row: Mapping-like record (for example a pandas row) that provides
            the ``title``, ``ingredients`` and ``NER`` fields.
        system_message: System prompt to embed in the example. When omitted,
            the module-level ``system_msg`` is used, which keeps the original
            single-argument call working unchanged.

    Returns:
        A dict with a single ``"messages"`` key holding the system, user and
        assistant turns, in that order.
    """
    if system_message is None:
        system_message = system_msg
    # The prompt ends with "Generic ingredients: " so that the assistant turn
    # (the NER value) reads as the continuation of that line.
    user_prompt = (
        f"Title: {row['title']}\n\n"
        f"Ingredients: {row['ingredients']}\n\n"
        "Generic ingredients: "
    )
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": row["NER"]},
        ]
    }
# `.loc` slices by label and includes both endpoints: labels 0..100 feed the
# training split and labels 101..200 feed the validation split.
train_rows = recipe_df.loc[0:100]
val_rows = recipe_df.loc[101:200]
# Turn every selected row into one chat-format example (a dict per row).
training_data = train_rows.apply(prepare_example, axis=1).tolist()
validation_data = val_rows.apply(prepare_example, axis=1).tolist()
def write_jsonl(data, path):
    """Write records to ``path`` in JSON Lines format (one JSON value per line).

    Args:
        data: Iterable of objects accepted by ``json.dumps``; each one
            becomes a single line of the output file.
        path: Destination file path. An existing file is overwritten.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # text encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in data)
# Persist both splits as JSONL files; the training file is written first.
for records, filename in (
    (training_data, "train.jsonl"),
    (validation_data, "val.jsonl"),
):
    write_jsonl(records, filename)
Загрузка файлов и запуск задания дообучения (fine-tuning job)
def upload_file(path):
    """Upload a local file for fine-tuning and return the created file's ID.

    The file at ``path`` is opened in binary mode and sent through the
    module-level ``client`` with ``purpose="fine-tune"``.
    """
    with open(path, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose="fine-tune")
    return uploaded.id
# Upload the training file, then the validation file, keeping their IDs.
train_id, val_id = (upload_file(name) for name in ("train.jsonl", "val.jsonl"))

# Request a fine-tuning job on the uploaded files.
job_request = {
    "training_file": train_id,
    "validation_file": val_id,
    "model": "gpt-4o-mini-2024-07-18",
    "suffix": "recipe-ner",
}
job = client.fine_tuning.jobs.create(**job_request)
print("Job ID:", job.id, "| Status:", job.status)
Мониторинг и инференс
import time
# Poll the fine-tuning job once a minute until it reaches a terminal status.
while True:
    current_job = client.fine_tuning.jobs.retrieve(job.id)
    status = current_job.status
    print(f"Status: {status}")
    if status in ("succeeded", "failed", "cancelled"):
        break
    time.sleep(60)

# A failed or cancelled job should not flow into the inference step: stop
# here with an explicit error instead of passing on a model name that was
# never produced.
if status != "succeeded":
    raise RuntimeError(
        f"Fine-tuning job {job.id} ended with status {status!r}: "
        f"{getattr(current_job, 'error', None)}"
    )

# Reuse the object from the final poll instead of issuing another retrieve.
ft_model = current_job.fine_tuned_model
# Inference: query the fine-tuned model. The user prompt separates its
# sections with blank lines ("\n\n"), matching the layout that
# prepare_example uses for the training examples.
resp = client.chat.completions.create(
    model=ft_model,
    messages=[
        {"role": "system", "content": system_msg},
        {
            "role": "user",
            "content": "Title: Pasta\n\nIngredients: [...]\n\nGeneric ingredients: ",
        },
    ],
    temperature=0,
    max_tokens=200,
)
print(resp.choices[0].message.content)
Начинайте с 50–100 примеров. Качество модели обычно растёт не пропорционально объёму датасета, а примерно на одинаковую величину при каждом его удвоении, поэтому итеративно увеличивайте размер, пока не достигнете целевой точности.