Help Center/
ModelArts/
Best Practices/
LLM Training/
Adapting Mainstream Open-Source Models to AscendFactory NPU Training Based on Lite Server/
Training Service Configurations/
VeRL Data Processing Sample Script
Updated on 2025-11-04 GMT+08:00
VeRL Data Processing Sample Script
The VeRL framework includes sample data processing scripts for both large language models (LLMs) and multimodal models. Choose the appropriate script according to your model type.
LLM GSM8k Data Processing
import argparse
import os
import re
import datasets
from verl.utils.hdfs_io import copy, makedirs
def extract_solution(solution_str):
    """Return the final numeric answer that follows the "#### " marker.

    Args:
        solution_str: Answer text containing a "#### <number>" marker.

    Returns:
        The matched number as a string with thousands separators (",") removed.

    Raises:
        AssertionError: If no "#### <number>" marker is present.
    """
    match = re.search(r"#### (\-?[0-9\.\,]+)", solution_str)
    # Raise explicitly instead of using `assert`: assert statements are removed
    # under `python -O`, which would turn a missing answer into an
    # AttributeError on None further down.
    if match is None:
        raise AssertionError(f"no '#### <answer>' marker found in: {solution_str!r}")
    # Group 1 is the number itself; drop the thousands separators.
    return match.group(1).replace(",", "")
if __name__ == "__main__":
    # Convert the GSM8k train/test splits into chat-style prompt records and
    # write them as parquet files, optionally copying the result to HDFS.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default="~/data/gsm8k")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument(
        "--local_dataset_path",
        default=None,
        help="Path to a local copy of the raw dataset; the hub id is used when omitted.",
    )
    args = parser.parse_args()

    data_source = "openai/gsm8k"
    # Load from the user-supplied dataset path when given, otherwise by hub id.
    dataset = datasets.load_dataset(args.local_dataset_path or data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = 'Let\'s think step by step and output the final answer after "####".'

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        """Build the per-example mapper for the given split name."""

        def process_fn(example, idx):
            question_raw = example.pop("question")
            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": question,
                    }
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    # The default starts with "~", which os.path.join does not expand; resolve
    # it and make sure the output directory exists before writing.
    local_dir = os.path.expanduser(args.local_dir)
    os.makedirs(local_dir, exist_ok=True)
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)
Multimodal Model Geometry3K Data Processing
import argparse
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
if __name__ == "__main__":
    # Convert the Geometry3K train/test splits into chat-style prompt records
    # (with their images) and write them as parquet files, optionally copying
    # the result to HDFS.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default="~/data/geo3k")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument(
        "--local_dataset_path",
        default=None,
        help="Path to a local copy of the raw dataset; the hub id is used when omitted.",
    )
    args = parser.parse_args()

    data_source = "hiyouga/geometry3k"
    # Load from the user-supplied dataset path when given, otherwise by hub id.
    dataset = datasets.load_dataset(args.local_dataset_path or data_source)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = (
        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
        r"The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
    )

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        """Build the per-example mapper for the given split name."""

        def process_fn(example, idx):
            problem = example.pop("problem")
            prompt = problem + " " + instruction_following
            answer = example.pop("answer")
            images = example.pop("images")

            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                "images": images,
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": answer},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer,
                    "question": problem,
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)

    # The default starts with "~", which os.path.join does not expand; resolve
    # it and make sure the output directory exists before writing.
    local_dir = os.path.expanduser(args.local_dir)
    os.makedirs(local_dir, exist_ok=True)
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)
Parent topic: Training Service Configurations
Feedback
Was this page helpful?
Provide feedback
Thank you very much for your feedback. We will continue working to improve the documentation. See the reply and handling status in My Cloud VOC.
The system is busy. Please try again later.
For any further questions, feel free to contact us through the chatbot.
Chatbot