文档首页/
AI开发平台ModelArts/
最佳实践/
LLM大语言模型训练/
LLM大语言模型训练历史版本文档/
主流开源大模型基于ModelArts Standard&Lite Server适配AscendFactory PyTorch NPU训练指导(6.5.905)/
训练服务配置说明/
VeRL数据处理样例脚本
更新时间:2025-07-29 GMT+08:00
VeRL数据处理样例脚本
VeRL框架中的样例数据处理脚本按模型类型分为大语言模型和多模态模型两类,请根据所训练的模型类型选择对应的样例脚本:
大语言模型gsm8k数据处理
"""Convert the GSM8K dataset into the train/test parquet files used by VeRL."""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs

# Placeholder from the sample: replace with the actual location of the GSM8K
# dataset before running. Depending on how the dataset is stored, additional
# ``load_dataset`` arguments (e.g. a config name) may be needed -- TODO confirm.
DATASET_PATH = "xxx/xxx/xxx"

# Matches the "#### <number>" marker; the number may be negative and may
# contain dots and thousands separators.
_SOLUTION_PATTERN = re.compile(r"#### (\-?[0-9\.\,]+)")


def extract_solution(solution_str):
    """Return the final numeric answer found after ``####`` in *solution_str*.

    Args:
        solution_str: Answer text containing a ``#### <number>`` marker.

    Returns:
        The captured number as a string, with thousands separators (``,``)
        removed.

    Raises:
        AssertionError: If no ``#### <number>`` marker is present.
    """
    match = _SOLUTION_PATTERN.search(solution_str)
    if match is None:
        # Raised explicitly (instead of via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept.
        raise AssertionError(f"no '#### <number>' marker found in: {solution_str!r}")
    return match.group(1).replace(",", "")


def main():
    """Parse CLI arguments, reshape both splits and write them as parquet."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default="~/data/gsm8k")
    parser.add_argument("--hdfs_dir", default=None)
    args = parser.parse_args()

    data_source = "openai/gsm8k"
    dataset = datasets.load_dataset(DATASET_PATH)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = 'Let\'s think step by step and output the final answer after "####".'

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")
            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            return {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": question,
                    }
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                },
            }

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    # The default starts with "~"; expand it explicitly so that both the
    # parquet writer and the later copy receive a real filesystem path.
    local_dir = os.path.expanduser(args.local_dir)
    # Create the output directory up front rather than relying on the writer.
    os.makedirs(local_dir, exist_ok=True)
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))

    # Optionally mirror the local output directory to the given HDFS directory.
    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)


if __name__ == "__main__":
    main()
多模态模型geometry3k数据处理
"""Convert the Geometry3K dataset into the train/test parquet files used by VeRL."""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

# Placeholder from the sample: replace with the actual location of the
# Geometry3K dataset before running.
DATASET_PATH = "xxx/xxx/xxx"


def main():
    """Parse CLI arguments, reshape both splits and write them as parquet."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default="~/data/geo3k")
    parser.add_argument("--hdfs_dir", default=None)
    args = parser.parse_args()

    data_source = "hiyouga/geometry3k"
    dataset = datasets.load_dataset(DATASET_PATH)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    # Appended to every problem statement to form the user prompt.
    instruction_following = (
        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
        r"The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
    )

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            problem = example.pop("problem")
            prompt = problem + " " + instruction_following
            answer = example.pop("answer")
            images = example.pop("images")
            return {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                "images": images,
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": answer},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer,
                    "question": problem,
                },
            }

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)

    # The default starts with "~"; expand it explicitly so that both the
    # parquet writer and the later copy receive a real filesystem path.
    local_dir = os.path.expanduser(args.local_dir)
    # Create the output directory up front rather than relying on the writer.
    os.makedirs(local_dir, exist_ok=True)
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))

    # Optionally mirror the local output directory to the given HDFS directory.
    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)


if __name__ == "__main__":
    main()
父主题: 训练服务配置说明