Configuration

Config
type	object
properties
train	#/$defs/TrainingConfig
rollout	#/$defs/RolloutConfig
policy	#/$defs/PolicyConfig
logging	#/$defs/LoggingConfig
profiler	#/$defs/ProfilerConfig
validation	#/$defs/ValidationConfig
$defs
CheckpointConfig	CheckpointConfig
	type	object
	properties
	enable_checkpoint	Enable Checkpoint
		Enable checkpointing for training. If set to False, no checkpoint will be saved.
		type	boolean
		default	False
	save_freq	Save Freq
		Checkpoint save frequency for training steps
		type	integer
		default	20
	save_mode	Save Mode
		Checkpoint save mode for training steps
		type	string
		default	async
	max_keep	Max Keep
		Maximum number of checkpoints to keep. If set to -1, all checkpoints will be kept.
		type	integer
		default	5
	export_safetensors	Export Safetensors
		Whether to export safetensors weights for Hugging Face usage, including the related config files.
		type	boolean
		default	True
	upload_hf	Upload Hf
		Whether to upload the safetensors weight to huggingface.
		type	boolean
		default	False
	hf_repo_name	Hf Repo Name
		The huggingface repo name to upload the safetensors weight.
		type	string
		default	Comos-Reason1
	upload_s3	Upload S3
		Whether to upload the checkpoint and safetensors to S3. Defaults to False; set to "final" to upload only the final checkpoint, or "all" to upload all checkpoints.
		default	False
		anyOf	type	boolean
			type	string
	s3_bucket	S3 Bucket
		The S3 bucket name to upload the checkpoint and safetensors weight.
		default	None
		anyOf	type	string
			type	null
	s3_prefix	S3 Prefix
		The S3 prefix to upload the checkpoint and safetensors weight.
		type	string
		default	outputs
DatasetConfig	DatasetConfig
	type	object
	properties
	name	Name
		Huggingface dataset name or local path to parquet file
		type	string
		default
	subset	Subset
		Dataset subset, if one exists
		default
		anyOf	type	string
			type	null
	revision	Revision
		Dataset git revision, if one exists; can be a branch name, a tag, or a commit hash.
		default
		anyOf	type	string
			type	null
	split	Split
		A list of dataset splits to train
		default
		anyOf	type	string
			type	array
			items	type	string
	test_size	Test Size
		Size of the test set. If float, it is the ratio (between 0.0 and 1.0) of the dataset; if int, it is the absolute size of the test set.
		default	None
		anyOf	type	number
			type	integer
			type	null
FP8Config	FP8Config
	type	object
	properties
	enable_fp8	Enable Fp8
		Whether to enable fp8.
		type	boolean
		default	False
	fp8_recipe	Fp8 Recipe
		Recipe for weight scale calculation.
		type	string
		default	dynamic_scaling
	quant_recipe	Quant Recipe
		Quantization strategy for weight.
		type	string
		default	rowwise
GrpoConfig	GrpoConfig
	type	object
	properties
	type	Type
		type	string
		const	grpo
	variant	Variant
		Variant of GRPO; currently supports grpo and dapo.
		type	string
		default	grpo
	dataset	Dataset configuration for GRPO training. It includes dataset name, subset, revision, train split, test split and test size.
		#/$defs/DatasetConfig
	dataloader_shuffle	Dataloader Shuffle
		Shuffle the dataloader. If False, the dataloader will be used in the order it is loaded.
		type	boolean
		default	True
	enable_dataset_cache	Enable Dataset Cache
		Enable caching of dataset processing results, which may accelerate dataset loading
		type	boolean
		default	False
	dataloader_num_workers	Dataloader Num Workers
		Number of subprocesses to use for data loading
		type	integer
		default	0
	dataloader_prefetch_factor	Dataloader Prefetch Factor
		Number of batches loaded in advance by each worker.
		default	None
		anyOf	type	integer
			type	null
	prompt_column_name	Prompt Column Name
		Column name for prompt
		type	string
		default
	response_column_name	Response Column Name
		Column name for response/reference answer
		type	string
		default
	reward_function	Reward Function
		A list of reward functions for the model. Currently supports single_choice, boxed_math, and format.
		anyOf	type	string
			type	array
			items	type	string
	temperature	Temperature
		Temperature for sampling. The higher the temperature, the more random the completions.
		type	number
		default	1.0
	epsilon_low	Epsilon Low
		Epsilon value for clipping.
		type	number
		default	0.2
	epsilon_high	Epsilon High
		Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower bound specified in epsilon_low. The DAPO paper recommends 0.28.
		type	number
		default	0.2
	lower_bound_ratio	Lower Bound Ratio
		Lower-bound ratio for dual-clip.
		type	number
		default	3.0
	loss_type	Loss Type
		The type of loss to use for GRPO training.
		type	string
		default	token-mean
	unbiased_loss_max_tokens	Unbiased Loss Max Tokens
		Maximum number of tokens to use for unbiased loss introduced in Dr.GRPO. If set to None, will not use unbiased loss. Only available when loss_type is seq-mean-token-mean.
		default	None
		anyOf	type	integer
			type	null
	unbiased_advantage	Unbiased Advantage
		Whether to divide the advantage by the standard deviation of rewards.
		type	boolean
		default	False
	overlong_reward	Configuration for overlong reward penalty. If enabled, the output will be penalized for responses that are too long.
		#/$defs/OverlongRewardConfig
	kl_beta	Kl Beta
		KL coefficient. If 0.0, the reference model is not loaded, reducing memory usage and improving training speed, but may be numerically unstable for long training runs.
		type	number
		default	0.0
	aipo_rho	Aipo Rho
		Rho value for AIPO (Asynchronous Importance weighted Policy Optimization). The clipping constant of the importance sampling ratio, suggest [2,10]. reference: https://arxiv.org/pdf/2505.24034
		default	None
		anyOf	type	number
			type	null
	mu_iterations	Mu Iterations
		Number of iterations per batch (denoted as μ in the algorithm).
		type	integer
		default	1
	mini_batch	Mini Batch
		Mini-batch size for GRPO training.
		type	integer
		default	2
	allowed_outdated_steps	Allowed Outdated Steps
		Allowed outdated-async steps for rollout engine. If the number of left pending rollouts is larger than the allowed_outdated_steps * n_policy_replicas * train_batch_per_replica, then rollout engine traffic will be throttled.
		type	integer
		default	4
	min_filter_prefix_tokens	Min Filter Prefix Tokens
		Minimum number of tokens to filter the prefix tokens for the rollouts inside the same group. If the number of tokens is larger than the min_filter_prefix_tokens, the rollouts with the same prefix but different rewards will be filtered out in loss calculation.
		default	None
		anyOf	type	integer
			type	null
LoggingConfig	LoggingConfig
	type	object
	properties
	logger	Logger
		List of loggers to use, e.g., [‘console’, ‘wandb’]
		type	array
		items	type	string
	project_name	Project Name
		Wandb project name for logging. If set, the training will be logged to this project.
		type	string
		default	cosmos_rl
	experiment_name	Experiment Name
		A short display name for this run. If not set, will use the output_dir as the experiment name.
		default	None
		anyOf	type	string
			type	null
OverlongRewardConfig	OverlongRewardConfig
	type	object
	properties
	enable_overlong_penalty	Enable Overlong Penalty
		Enable overlong penalty for the model. If set to True, the output will be penalized for responses that are too long.
		type	boolean
		default	False
	buffer_length	Buffer Length
		Length of the buffer for overlong penalty. If the response length exceeds this value, the output will be penalized.
		type	integer
		default	4096
	penalty_factor	Penalty Factor
		Penalty factor for overlong penalty. The penalty increases linearly with the length of the response exceeding the buffer length from 0 to the penalty_factor.
		type	number
		default	1.0
ParallelismConfig	ParallelismConfig
	type	object
	properties
	n_init_replicas	N Init Replicas
		Number of initial replicas to be created
		type	integer
		default	1
	tp_size	Tp Size
		Tensor parallelism size
		type	integer
		default	2
	cp_size	Cp Size
		Context parallelism size
		type	integer
		default	1
	dp_shard_size	Dp Shard Size
		Data Parallelism size in sharded mode
		type	integer
		default	-1
	pp_size	Pp Size
		Pipeline parallelism size
		type	integer
		default	1
	pp_dynamic_shape	Pp Dynamic Shape
		Pipeline parallelism dynamic shape
		type	boolean
		default	False
	pp_micro_batch_size	Pp Micro Batch Size
		Pipeline parallelism micro batch size, n_micro_batch = batch_size / pp_micro_batch_size, which must be divisible by pp stages
		type	integer
		default	1
	dp_replicate_size	Dp Replicate Size
		Data parallelism size in replica mode. Only configurable in SFT jobs; must be 1 in GRPO jobs to support dynamic scaling.
		type	integer
		default	1
PolicyConfig	PolicyConfig
	type	object
	properties
	parallelism	#/$defs/ParallelismConfig
	model_name_or_path	Model Name Or Path
		The model name or path, compatible with huggingface model name or local path
		type	string
		default	Qwen/Qwen2.5-VL-7B-Instruct
	model_max_length	Model Max Length
		The maximum length for training, longer than this will be ignored for training stability
		type	integer
		default	4096
	model_gradient_checkpointing	Model Gradient Checkpointing
		Whether to use gradient checkpointing
		type	boolean
		default	True
ProfilerConfig	ProfilerConfig
	type	object
	properties
	enable_profiler	Enable Profiler
		Enable profiler for training
		type	boolean
		default	False
	sub_profiler_config	Sub profiler config
		#/$defs/SubProfilerConfig
RolloutConfig	RolloutConfig
	type	object
	properties
	parallelism	#/$defs/RolloutParallelismConfig
	enforce_eager	Enforce Eager
		Whether to enable eager execution for vLLM.
		type	boolean
		default	True
	include_stop_str_in_output	Include Stop Str In Output
		Whether to include stop string in output.
		type	boolean
		default	False
	gpu_memory_utilization	Gpu Memory Utilization
		GPU memory utilization factor for rollout backend.
		type	number
		default	0.8
	enable_chunked_prefill	Enable Chunked Prefill
		Whether to enable chunked prefill for vLLM.
		type	boolean
		default	False
	max_response_length	Max Response Length
		Max output length of rollout generation.
		type	integer
		default	2048
	n_generation	N Generation
		Number of generations per prompt; same as the n parameter in the OpenAI chat API.
		type	integer
		default	16
	batch_size	Batch Size
		Batch size for rollout.
		type	integer
		default	1
	val_batch_size	Val Batch Size
		Batch size for rollout generation during validation.
		default	None
		anyOf	type	integer
			type	null
	quantization	Quantization
		Quantization in vllm rollout generation.
		type	string
		default	none
	seed	Seed
		Random seed for rollout.
		default	None
		anyOf	type	integer
			type	null
	sampling_config	#/$defs/SamplingConfig
	vllm_use_flashinfer	Vllm Use Flashinfer
		Use flashinfer for vllm rollout.
		type	boolean
		default	False
RolloutParallelismConfig	RolloutParallelismConfig
	type	object
	properties
	n_init_replicas	N Init Replicas
		Number of initial replicas to be created
		type	integer
		default	1
	tp_size	Tp Size
		Tensor parallelism size
		type	integer
		default	2
	cp_size	Cp Size
		Context parallelism size
		type	integer
		default	1
	dp_shard_size	Dp Shard Size
		Data Parallelism size in sharded mode
		type	integer
		default	-1
	pp_size	Pp Size
		Pipeline parallelism size
		type	integer
		default	1
	pp_dynamic_shape	Pp Dynamic Shape
		Pipeline parallelism dynamic shape
		type	boolean
		default	False
	pp_micro_batch_size	Pp Micro Batch Size
		Pipeline parallelism micro batch size, n_micro_batch = batch_size / pp_micro_batch_size, which must be divisible by pp stages
		type	integer
		default	1
	dp_replicate_size	Dp Replicate Size
		Data parallelism size in replica mode; only 1 is supported, to allow dynamic scaling.
		type	integer
		default	1
SFTDataConfig	SFTDataConfig
	type	object
	properties
	type	Type
		type	string
		const	sft
	dataset	Dataset configuration for SFT training. It includes dataset name, subset, revision, train split, and test split.
		#/$defs/DatasetConfig
	dataloader_shuffle	Dataloader Shuffle
		Shuffle the dataloader. If False, the dataloader will be used in the order it is loaded.
		type	boolean
		default	False
	enable_dataset_cache	Enable Dataset Cache
		Enable caching of dataset processing results, which may accelerate dataset loading
		type	boolean
		default	False
	dataloader_num_workers	Dataloader Num Workers
		Number of subprocesses to use for data loading
		type	integer
		default	0
	dataloader_prefetch_factor	Dataloader Prefetch Factor
		Number of batches loaded in advance by each worker.
		default	None
		anyOf	type	integer
			type	null
	conversation_column_name	Conversation Column Name
		Column name for formatted conversation JSON
		type	string
		default	conversations
	system_prompt	System Prompt
		System prompt for the model, which will be prepended to the prompt
		type	string
		default
SamplingConfig	SamplingConfig
	type	object
	properties
	temperature	Temperature
		Temperature for sampling.
		type	number
		default	1.0
	top_p	Top P
		Top-p for sampling.
		type	number
		default	1.0
	top_k	Top K
		Top-k for sampling.
		type	integer
		default	-1
	repetition_penalty	Repetition Penalty
		Repetition penalty for sampling.
		type	number
		default	1.0
	use_flashinfer	Use Flashinfer
		Use flashinfer for sampling.
		type	boolean
		default	False
SubProfilerConfig	SubProfilerConfig
	type	object
	properties
	do_profile	Do Profile
		Whether to profile, only used in runtime.
		type	boolean
		default	False
	active_steps	Active Steps
		Number of active steps
		type	integer
		default	1
	rank_filter	Rank Filter
		Rank filter
		type	array
		items	type	integer
	record_shape	Record Shape
		Whether to record shape
		type	boolean
		default	False
	profile_memory	Profile Memory
		Whether to profile memory
		type	boolean
		default	False
	with_stack	With Stack
		Whether to profile stack
		type	boolean
		default	False
	with_modules	With Modules
		Whether to profile modules
		type	boolean
		default	False
TrainingConfig	TrainingConfig
	type	object
	properties
	train_policy	Train Policy
		default	type	grpo
			variant	grpo
			dataset	name
				revision
				split
				subset
				test_size	None
			dataloader_shuffle	True
			enable_dataset_cache	False
			dataloader_num_workers	0
			dataloader_prefetch_factor	None
			prompt_column_name
			response_column_name
			reward_function	single_choice
			temperature	1.0
			epsilon_low	0.2
			epsilon_high	0.2
			lower_bound_ratio	3.0
			loss_type	token-mean
			unbiased_loss_max_tokens	None
			unbiased_advantage	False
			overlong_reward	buffer_length	4096
				enable_overlong_penalty	False
				penalty_factor	1.0
			kl_beta	0.0
			aipo_rho	None
			mu_iterations	1
			mini_batch	2
			allowed_outdated_steps	4
			min_filter_prefix_tokens	None
		oneOf	#/$defs/SFTDataConfig
			#/$defs/GrpoConfig
	fp8	#/$defs/FP8Config
	ckpt	#/$defs/CheckpointConfig
	resume	Resume
		Resume training from a checkpoint. If True, will resume from the latest checkpoint of the output_dir. If a string, will resume from the specified checkpoint path.
		default	False
		anyOf	type	boolean
			type	string
	epoch	Epoch
		Number of epochs for training
		type	integer
		default	1
	output_dir	Output Dir
		Output directory
		type	string
		default	./outputs
	timestamp	Timestamp
		Timestamp for the output directory and wandb ID, if not set, will be generated automatically
		type	string
		default
	epsilon	Epsilon
		Epsilon for optimizer
		type	number
		default	1e-06
	optm_name	Optm Name
		Optimizer name
		type	string
		default	AdamW
	optm_lr	Optm Lr
		Learning rate for optimizer, can be a float or a list of floats for multiple optimizers
		default	1e-06
		anyOf	type	number
			type	array
			items	type	number
	optm_impl	Optm Impl
		Implementation type for optimizer. More info: https://pytorch.org/docs/stable/optim.html, can be a list of strings for multiple optimizers
		default	fused
		anyOf	type	string
			type	array
			items	type	string
	optm_weight_decay	Optm Weight Decay
		Weight decay for optimizer
		type	number
		default	0.01
	optm_betas	Optm Betas
		Betas for optimizer
		type	array
		default	0.9
			0.999
		maxItems	2
		minItems	2
	optm_warmup_steps	Optm Warmup Steps
		Warmup steps for optimizer
		type	integer
		default	20
	optm_grad_norm_clip	Optm Grad Norm Clip
		Gradient norm clip for optimizer
		type	number
		default	1.0
	async_tp_enabled	Async Tp Enabled
		Whether to use async tensor parallelism
		type	boolean
		default	False
	compile	Compile
		Whether to use torch.compile
		type	boolean
		default	True
	param_dtype	Param Dtype
		The data type for parameters and activations
		type	string
		default	bfloat16
	fsdp_reduce_dtype	Fsdp Reduce Dtype
		The data type for reduction in FSDP
		type	string
		default	float32
	fsdp_offload	Fsdp Offload
		Whether to offload the model to CPU if using FSDP
		type	boolean
		default	False
	fsdp_reshard_after_forward	Fsdp Reshard After Forward
		Reshard the param after forward pass in FSDP
		type	string
		default	default
	train_batch_per_replica	Train Batch Per Replica
		The batch size for training per iteration in one replica, this is the local batch size for each gradient accumulation step
		type	integer
		default	8
	enable_validation	Enable Validation
		Enable validation during training.
		type	boolean
		default	False
	validation_step	Validation Step
		Validation frequency during training, in terms of training steps
		type	integer
		default	20
	validation_batch_per_replica	Validation Batch Per Replica
		The batch size for validation per iteration in one replica.
		type	integer
		default	24
	sync_weight_interval	Sync Weight Interval
		The interval of train step for synchronizing weights between replicas.
		type	integer
		default	1
ValidationConfig	ValidationConfig
	type	object
	properties
	dataset	Dataset configuration for validation. It includes dataset name, subset, revision and test split.
		#/$defs/DatasetConfig
	temperature	Temperature
		Temperature for sampling during validation.
		type	number
		default	0.9
	top_p	Top P
		Top-p for sampling during validation.
		type	number
		default	1.0
	top_k	Top K
		Top-k for sampling during validation.
		type	integer
		default	10
	repetition_penalty	Repetition Penalty
		Repetition penalty for sampling during validation.
		type	number
		default	1.0
	n_generation	N Generation
		Number of generations per prompt during validation; same as the n parameter in the OpenAI chat API.
		type	integer
		default	1
	max_response_length	Max Response Length
		Max output length of rollout generation during validation.
		type	integer
		default	2048