Lang Feng committed on
add 'resources_per_worker' config for easily managing cpus/gpus of each env worker (#148)
Browse files- agent_system/environments/env_manager.py +13 -10
- agent_system/environments/env_package/alfworld/envs.py +5 -5
- agent_system/environments/env_package/appworld/envs.py +5 -2
- agent_system/environments/env_package/gym_cards/envs.py +7 -3
- agent_system/environments/env_package/sokoban/envs.py +5 -3
- agent_system/environments/env_package/webshop/envs.py +11 -9
- examples/dapo_trainer/run_alfworld.sh +3 -0
- examples/dapo_trainer/run_webshop.sh +3 -0
- examples/gigpo_dynamic_trainer/run_alfworld.sh +3 -0
- examples/gigpo_dynamic_trainer/run_sokoban.sh +3 -0
- examples/gigpo_dynamic_trainer/run_webshop.sh +3 -0
- examples/gigpo_trainer/run_alfworld.sh +3 -0
- examples/gigpo_trainer/run_alfworld_lora.sh +3 -0
- examples/gigpo_trainer/run_blackjack.sh +3 -0
- examples/gigpo_trainer/run_ezpoints.sh +3 -0
- examples/gigpo_trainer/run_numberline.sh +3 -0
- examples/gigpo_trainer/run_sokoban.sh +3 -0
- examples/gigpo_trainer/run_webshop.sh +3 -0
- examples/gigpo_trainer/run_webshop_lora.sh +3 -0
- examples/gigpo_trainer/run_webshop_qwen3.sh +3 -0
- examples/grpo_trainer/run_alfworld.sh +3 -0
- examples/grpo_trainer/run_balckjack.sh +3 -0
- examples/grpo_trainer/run_sokoban.sh +3 -0
- examples/grpo_trainer/run_webshop.sh +3 -0
- examples/ppo_trainer/run_alfworld.sh +3 -0
- examples/ppo_trainer/run_webshop.sh +3 -0
- verl/trainer/config/ppo_trainer.yaml +3 -0
agent_system/environments/env_manager.py
CHANGED
|
@@ -22,6 +22,7 @@ import os
|
|
| 22 |
from agent_system.environments.prompts import *
|
| 23 |
from agent_system.environments.base import EnvironmentManagerBase, to_numpy
|
| 24 |
from agent_system.memory import SimpleMemory
|
|
|
|
| 25 |
|
| 26 |
def parse_gamefile(infos):
|
| 27 |
gamefile = []
|
|
@@ -518,10 +519,12 @@ def make_envs(config):
|
|
| 518 |
if not isinstance(config.env.rollout.n, int):
|
| 519 |
raise ValueError("config.env.rollout.n should be an integer")
|
| 520 |
group_n = config.env.rollout.n if config.env.rollout.n > 0 else 1
|
|
|
|
|
|
|
| 521 |
if "gym_cards" in config.env.env_name.lower():
|
| 522 |
from agent_system.environments.env_package.gym_cards import build_gymcards_envs, gym_projection
|
| 523 |
-
_envs = build_gymcards_envs(env_name=config.env.env_name, seed=config.env.seed, env_num=config.data.train_batch_size, group_n=group_n, is_train=True)
|
| 524 |
-
_val_envs = build_gymcards_envs(env_name=config.env.env_name, seed=config.env.seed + 1000, env_num=config.data.val_batch_size, group_n=1, is_train=False)
|
| 525 |
|
| 526 |
projection_f = partial(gym_projection, env_name=config.env.env_name)
|
| 527 |
envs = GymCardEnvironmentManager(_envs, projection_f, config)
|
|
@@ -539,8 +542,8 @@ def make_envs(config):
|
|
| 539 |
env_kwargs = {
|
| 540 |
'eval_dataset': 'eval_in_distribution', # 'eval_in_distribution' or 'eval_out_of_distribution'
|
| 541 |
}
|
| 542 |
-
_envs = build_alfworld_envs(alf_config_path, config.env.seed, config.data.train_batch_size, group_n, is_train=True, env_kwargs=env_kwargs)
|
| 543 |
-
_val_envs = build_alfworld_envs(alf_config_path, config.env.seed + 1000, config.data.val_batch_size, 1, is_train=False, env_kwargs=env_kwargs)
|
| 544 |
|
| 545 |
projection_f = partial(alfworld_projection)
|
| 546 |
envs = AlfWorldEnvironmentManager(_envs, projection_f, config)
|
|
@@ -554,8 +557,8 @@ def make_envs(config):
|
|
| 554 |
'max_steps': config.env.max_steps,
|
| 555 |
'search_depth': config.env.sokoban.search_depth
|
| 556 |
}
|
| 557 |
-
_envs = build_sokoban_envs(config.env.seed, config.data.train_batch_size, group_n, mode=config.env.sokoban.mode, is_train=True, env_kwargs=env_kwargs)
|
| 558 |
-
_val_envs = build_sokoban_envs(config.env.seed + 1000, config.data.val_batch_size, 1, mode=config.env.sokoban.mode, is_train=False, env_kwargs=env_kwargs)
|
| 559 |
|
| 560 |
projection_f = partial(sokoban_projection)
|
| 561 |
envs = SokobanEnvironmentManager(_envs, projection_f, config)
|
|
@@ -576,8 +579,8 @@ def make_envs(config):
|
|
| 576 |
'file_path': file_path,
|
| 577 |
'attr_path': attr_path
|
| 578 |
}
|
| 579 |
-
_envs = build_webshop_envs(seed=config.env.seed, env_num=config.data.train_batch_size, group_n=group_n, is_train=True, env_kwargs=env_kwargs)
|
| 580 |
-
_val_envs = build_webshop_envs(seed=config.env.seed + 1000, env_num=config.data.val_batch_size, group_n=1, is_train=False, env_kwargs=env_kwargs)
|
| 581 |
|
| 582 |
projection_f = partial(webshop_projection)
|
| 583 |
envs = WebshopEnvironmentManager(_envs, projection_f, config)
|
|
@@ -587,8 +590,8 @@ def make_envs(config):
|
|
| 587 |
return envs, val_envs
|
| 588 |
elif "appworld" in config.env.env_name.lower():
|
| 589 |
from agent_system.environments.env_package.appworld import build_appworld_envs, appworld_projection
|
| 590 |
-
_envs = build_appworld_envs(dataset_name='train', seed=config.env.seed, env_num=config.data.train_batch_size, group_n=group_n, start_server_id=0)
|
| 591 |
-
_val_envs = build_appworld_envs(dataset_name='test_normal', seed=config.env.seed + 1000, env_num=config.data.val_batch_size, group_n=1, start_server_id=config.data.train_batch_size*group_n)
|
| 592 |
|
| 593 |
projection_f = partial(appworld_projection)
|
| 594 |
envs = AppWorldEnvironmentManager(_envs, projection_f, config)
|
|
|
|
| 22 |
from agent_system.environments.prompts import *
|
| 23 |
from agent_system.environments.base import EnvironmentManagerBase, to_numpy
|
| 24 |
from agent_system.memory import SimpleMemory
|
| 25 |
+
from omegaconf import OmegaConf
|
| 26 |
|
| 27 |
def parse_gamefile(infos):
|
| 28 |
gamefile = []
|
|
|
|
| 519 |
if not isinstance(config.env.rollout.n, int):
|
| 520 |
raise ValueError("config.env.rollout.n should be an integer")
|
| 521 |
group_n = config.env.rollout.n if config.env.rollout.n > 0 else 1
|
| 522 |
+
resources_per_worker = OmegaConf.to_container(config.env.resources_per_worker, resolve=True)
|
| 523 |
+
|
| 524 |
if "gym_cards" in config.env.env_name.lower():
|
| 525 |
from agent_system.environments.env_package.gym_cards import build_gymcards_envs, gym_projection
|
| 526 |
+
_envs = build_gymcards_envs(env_name=config.env.env_name, seed=config.env.seed, env_num=config.data.train_batch_size, group_n=group_n, is_train=True, resources_per_worker=resources_per_worker)
|
| 527 |
+
_val_envs = build_gymcards_envs(env_name=config.env.env_name, seed=config.env.seed + 1000, env_num=config.data.val_batch_size, group_n=1, is_train=False, resources_per_worker=resources_per_worker)
|
| 528 |
|
| 529 |
projection_f = partial(gym_projection, env_name=config.env.env_name)
|
| 530 |
envs = GymCardEnvironmentManager(_envs, projection_f, config)
|
|
|
|
| 542 |
env_kwargs = {
|
| 543 |
'eval_dataset': 'eval_in_distribution', # 'eval_in_distribution' or 'eval_out_of_distribution'
|
| 544 |
}
|
| 545 |
+
_envs = build_alfworld_envs(alf_config_path, config.env.seed, config.data.train_batch_size, group_n, is_train=True, env_kwargs=env_kwargs, resources_per_worker=resources_per_worker)
|
| 546 |
+
_val_envs = build_alfworld_envs(alf_config_path, config.env.seed + 1000, config.data.val_batch_size, 1, is_train=False, env_kwargs=env_kwargs, resources_per_worker=resources_per_worker)
|
| 547 |
|
| 548 |
projection_f = partial(alfworld_projection)
|
| 549 |
envs = AlfWorldEnvironmentManager(_envs, projection_f, config)
|
|
|
|
| 557 |
'max_steps': config.env.max_steps,
|
| 558 |
'search_depth': config.env.sokoban.search_depth
|
| 559 |
}
|
| 560 |
+
_envs = build_sokoban_envs(config.env.seed, config.data.train_batch_size, group_n, mode=config.env.sokoban.mode, is_train=True, env_kwargs=env_kwargs, resources_per_worker=resources_per_worker)
|
| 561 |
+
_val_envs = build_sokoban_envs(config.env.seed + 1000, config.data.val_batch_size, 1, mode=config.env.sokoban.mode, is_train=False, env_kwargs=env_kwargs, resources_per_worker=resources_per_worker)
|
| 562 |
|
| 563 |
projection_f = partial(sokoban_projection)
|
| 564 |
envs = SokobanEnvironmentManager(_envs, projection_f, config)
|
|
|
|
| 579 |
'file_path': file_path,
|
| 580 |
'attr_path': attr_path
|
| 581 |
}
|
| 582 |
+
_envs = build_webshop_envs(seed=config.env.seed, env_num=config.data.train_batch_size, group_n=group_n, is_train=True, env_kwargs=env_kwargs, resources_per_worker=resources_per_worker)
|
| 583 |
+
_val_envs = build_webshop_envs(seed=config.env.seed + 1000, env_num=config.data.val_batch_size, group_n=1, is_train=False, env_kwargs=env_kwargs, resources_per_worker=resources_per_worker)
|
| 584 |
|
| 585 |
projection_f = partial(webshop_projection)
|
| 586 |
envs = WebshopEnvironmentManager(_envs, projection_f, config)
|
|
|
|
| 590 |
return envs, val_envs
|
| 591 |
elif "appworld" in config.env.env_name.lower():
|
| 592 |
from agent_system.environments.env_package.appworld import build_appworld_envs, appworld_projection
|
| 593 |
+
_envs = build_appworld_envs(dataset_name='train', seed=config.env.seed, env_num=config.data.train_batch_size, group_n=group_n, start_server_id=0, resources_per_worker=resources_per_worker)
|
| 594 |
+
_val_envs = build_appworld_envs(dataset_name='test_normal', seed=config.env.seed + 1000, env_num=config.data.val_batch_size, group_n=1, start_server_id=config.data.train_batch_size*group_n, resources_per_worker=resources_per_worker)
|
| 595 |
|
| 596 |
projection_f = partial(appworld_projection)
|
| 597 |
envs = AppWorldEnvironmentManager(_envs, projection_f, config)
|
agent_system/environments/env_package/alfworld/envs.py
CHANGED
|
@@ -52,7 +52,6 @@ def compute_reward(info, multi_modal=False):
|
|
| 52 |
reward = 10.0 * float(info['won'])
|
| 53 |
return reward
|
| 54 |
|
| 55 |
-
@ray.remote(num_cpus=0.2)
|
| 56 |
class AlfworldWorker:
|
| 57 |
"""
|
| 58 |
Ray remote actor that replaces the worker function.
|
|
@@ -84,7 +83,7 @@ class AlfworldWorker:
|
|
| 84 |
return image
|
| 85 |
|
| 86 |
class AlfworldEnvs(gym.Env):
|
| 87 |
-
def __init__(self, alf_config_path, seed
|
| 88 |
super().__init__()
|
| 89 |
|
| 90 |
# Initialize Ray if not already initialized
|
|
@@ -100,9 +99,10 @@ class AlfworldEnvs(gym.Env):
|
|
| 100 |
self.group_n = group_n
|
| 101 |
|
| 102 |
# Create Ray remote actors instead of processes
|
|
|
|
| 103 |
self.workers = []
|
| 104 |
for i in range(self.num_processes):
|
| 105 |
-
worker =
|
| 106 |
self.workers.append(worker)
|
| 107 |
|
| 108 |
self.prev_admissible_commands = [None for _ in range(self.num_processes)]
|
|
@@ -202,5 +202,5 @@ class AlfworldEnvs(gym.Env):
|
|
| 202 |
for worker in self.workers:
|
| 203 |
ray.kill(worker)
|
| 204 |
|
| 205 |
-
def build_alfworld_envs(alf_config_path, seed, env_num, group_n, is_train=True, env_kwargs={}):
|
| 206 |
-
return AlfworldEnvs(alf_config_path, seed, env_num, group_n, is_train, env_kwargs)
|
|
|
|
| 52 |
reward = 10.0 * float(info['won'])
|
| 53 |
return reward
|
| 54 |
|
|
|
|
| 55 |
class AlfworldWorker:
|
| 56 |
"""
|
| 57 |
Ray remote actor that replaces the worker function.
|
|
|
|
| 83 |
return image
|
| 84 |
|
| 85 |
class AlfworldEnvs(gym.Env):
|
| 86 |
+
def __init__(self, alf_config_path, seed, env_num, group_n, resources_per_worker, is_train=True, env_kwargs={}):
|
| 87 |
super().__init__()
|
| 88 |
|
| 89 |
# Initialize Ray if not already initialized
|
|
|
|
| 99 |
self.group_n = group_n
|
| 100 |
|
| 101 |
# Create Ray remote actors instead of processes
|
| 102 |
+
env_worker = ray.remote(**resources_per_worker)(AlfworldWorker)
|
| 103 |
self.workers = []
|
| 104 |
for i in range(self.num_processes):
|
| 105 |
+
worker = env_worker.remote(config, seed + (i // self.group_n), base_env)
|
| 106 |
self.workers.append(worker)
|
| 107 |
|
| 108 |
self.prev_admissible_commands = [None for _ in range(self.num_processes)]
|
|
|
|
| 202 |
for worker in self.workers:
|
| 203 |
ray.kill(worker)
|
| 204 |
|
| 205 |
+
def build_alfworld_envs(alf_config_path, seed, env_num, group_n, resources_per_worker, is_train=True, env_kwargs={}):
|
| 206 |
+
return AlfworldEnvs(alf_config_path, seed, env_num, group_n, resources_per_worker, is_train, env_kwargs)
|
agent_system/environments/env_package/appworld/envs.py
CHANGED
|
@@ -39,7 +39,6 @@ def load_available_ports(port_file="appworld_ports.ports"):
|
|
| 39 |
|
| 40 |
return ports
|
| 41 |
|
| 42 |
-
@ray.remote(num_cpus=0.1)
|
| 43 |
class AppWorldWorker:
|
| 44 |
"""
|
| 45 |
Ray Actor that holds an instance of AppWorld and operates the environment
|
|
@@ -115,6 +114,7 @@ class AppWorldEnvs:
|
|
| 115 |
env_num,
|
| 116 |
group_n,
|
| 117 |
start_server_id,
|
|
|
|
| 118 |
port_file="appworld_ports.ports"
|
| 119 |
):
|
| 120 |
super().__init__()
|
|
@@ -145,10 +145,11 @@ class AppWorldEnvs:
|
|
| 145 |
ray.init()
|
| 146 |
|
| 147 |
# Create Ray actors (workers)
|
|
|
|
| 148 |
self.workers = []
|
| 149 |
for i in range(self.num_processes):
|
| 150 |
port = self.available_ports[i]
|
| 151 |
-
worker =
|
| 152 |
worker_id=start_server_id + i,
|
| 153 |
max_interactions=self.max_interactions,
|
| 154 |
port=port
|
|
@@ -240,6 +241,7 @@ def build_appworld_envs(dataset_name="train",
|
|
| 240 |
env_num=1,
|
| 241 |
group_n=1,
|
| 242 |
start_server_id=0,
|
|
|
|
| 243 |
):
|
| 244 |
|
| 245 |
return AppWorldEnvs(
|
|
@@ -249,4 +251,5 @@ def build_appworld_envs(dataset_name="train",
|
|
| 249 |
env_num=env_num,
|
| 250 |
group_n=group_n,
|
| 251 |
start_server_id=start_server_id,
|
|
|
|
| 252 |
)
|
|
|
|
| 39 |
|
| 40 |
return ports
|
| 41 |
|
|
|
|
| 42 |
class AppWorldWorker:
|
| 43 |
"""
|
| 44 |
Ray Actor that holds an instance of AppWorld and operates the environment
|
|
|
|
| 114 |
env_num,
|
| 115 |
group_n,
|
| 116 |
start_server_id,
|
| 117 |
+
resources_per_worker,
|
| 118 |
port_file="appworld_ports.ports"
|
| 119 |
):
|
| 120 |
super().__init__()
|
|
|
|
| 145 |
ray.init()
|
| 146 |
|
| 147 |
# Create Ray actors (workers)
|
| 148 |
+
env_worker = ray.remote(**resources_per_worker)(AppWorldWorker)
|
| 149 |
self.workers = []
|
| 150 |
for i in range(self.num_processes):
|
| 151 |
port = self.available_ports[i]
|
| 152 |
+
worker = env_worker.remote(
|
| 153 |
worker_id=start_server_id + i,
|
| 154 |
max_interactions=self.max_interactions,
|
| 155 |
port=port
|
|
|
|
| 241 |
env_num=1,
|
| 242 |
group_n=1,
|
| 243 |
start_server_id=0,
|
| 244 |
+
resources_per_worker={"num_cpus": 0.1},
|
| 245 |
):
|
| 246 |
|
| 247 |
return AppWorldEnvs(
|
|
|
|
| 251 |
env_num=env_num,
|
| 252 |
group_n=group_n,
|
| 253 |
start_server_id=start_server_id,
|
| 254 |
+
resources_per_worker=resources_per_worker
|
| 255 |
)
|
agent_system/environments/env_package/gym_cards/envs.py
CHANGED
|
@@ -18,7 +18,7 @@ import ray
|
|
| 18 |
import numpy as np
|
| 19 |
from gym_cards.envs import Point24Env, EZPointEnv, BlackjackEnv, NumberLineEnv
|
| 20 |
|
| 21 |
-
|
| 22 |
class GymCardsWorker:
|
| 23 |
"""
|
| 24 |
Ray remote actor that replaces the worker function.
|
|
@@ -66,6 +66,7 @@ class GymMultiProcessEnv(gym.Env):
|
|
| 66 |
seed=0,
|
| 67 |
env_num=1,
|
| 68 |
group_n=1,
|
|
|
|
| 69 |
is_train=True):
|
| 70 |
super().__init__()
|
| 71 |
|
|
@@ -80,11 +81,12 @@ class GymMultiProcessEnv(gym.Env):
|
|
| 80 |
self.num_processes = env_num * group_n
|
| 81 |
|
| 82 |
np.random.seed(seed)
|
| 83 |
-
|
| 84 |
# Create Ray remote actors instead of processes
|
|
|
|
| 85 |
self.workers = []
|
| 86 |
for _ in range(self.num_processes):
|
| 87 |
-
worker =
|
| 88 |
self.workers.append(worker)
|
| 89 |
|
| 90 |
def step(self, actions):
|
|
@@ -162,6 +164,7 @@ def build_gymcards_envs(env_name,
|
|
| 162 |
seed,
|
| 163 |
env_num,
|
| 164 |
group_n,
|
|
|
|
| 165 |
is_train=True):
|
| 166 |
"""
|
| 167 |
Externally exposed constructor function to create parallel Gym environments.
|
|
@@ -176,5 +179,6 @@ def build_gymcards_envs(env_name,
|
|
| 176 |
seed=seed,
|
| 177 |
env_num=env_num,
|
| 178 |
group_n=group_n,
|
|
|
|
| 179 |
is_train=is_train,
|
| 180 |
)
|
|
|
|
| 18 |
import numpy as np
|
| 19 |
from gym_cards.envs import Point24Env, EZPointEnv, BlackjackEnv, NumberLineEnv
|
| 20 |
|
| 21 |
+
|
| 22 |
class GymCardsWorker:
|
| 23 |
"""
|
| 24 |
Ray remote actor that replaces the worker function.
|
|
|
|
| 66 |
seed=0,
|
| 67 |
env_num=1,
|
| 68 |
group_n=1,
|
| 69 |
+
resources_per_worker={"num_cpus": 0.1},
|
| 70 |
is_train=True):
|
| 71 |
super().__init__()
|
| 72 |
|
|
|
|
| 81 |
self.num_processes = env_num * group_n
|
| 82 |
|
| 83 |
np.random.seed(seed)
|
| 84 |
+
|
| 85 |
# Create Ray remote actors instead of processes
|
| 86 |
+
env_worker = ray.remote(**resources_per_worker)(GymCardsWorker)
|
| 87 |
self.workers = []
|
| 88 |
for _ in range(self.num_processes):
|
| 89 |
+
worker = env_worker.remote(self.env_id)
|
| 90 |
self.workers.append(worker)
|
| 91 |
|
| 92 |
def step(self, actions):
|
|
|
|
| 164 |
seed,
|
| 165 |
env_num,
|
| 166 |
group_n,
|
| 167 |
+
resources_per_worker,
|
| 168 |
is_train=True):
|
| 169 |
"""
|
| 170 |
Externally exposed constructor function to create parallel Gym environments.
|
|
|
|
| 179 |
seed=seed,
|
| 180 |
env_num=env_num,
|
| 181 |
group_n=group_n,
|
| 182 |
+
resources_per_worker=resources_per_worker,
|
| 183 |
is_train=is_train,
|
| 184 |
)
|
agent_system/environments/env_package/sokoban/envs.py
CHANGED
|
@@ -18,7 +18,6 @@ import gym
|
|
| 18 |
from agent_system.environments.env_package.sokoban.sokoban import SokobanEnv
|
| 19 |
import numpy as np
|
| 20 |
|
| 21 |
-
@ray.remote(num_cpus=0.2)
|
| 22 |
class SokobanWorker:
|
| 23 |
"""
|
| 24 |
Ray remote actor that replaces the worker function.
|
|
@@ -57,6 +56,7 @@ class SokobanMultiProcessEnv(gym.Env):
|
|
| 57 |
env_num=1,
|
| 58 |
group_n=1,
|
| 59 |
mode='rgb_array',
|
|
|
|
| 60 |
is_train=True,
|
| 61 |
env_kwargs=None):
|
| 62 |
"""
|
|
@@ -82,9 +82,10 @@ class SokobanMultiProcessEnv(gym.Env):
|
|
| 82 |
env_kwargs = {}
|
| 83 |
|
| 84 |
# Create Ray remote actors instead of processes
|
|
|
|
| 85 |
self.workers = []
|
| 86 |
for i in range(self.num_processes):
|
| 87 |
-
worker =
|
| 88 |
self.workers.append(worker)
|
| 89 |
|
| 90 |
def step(self, actions):
|
|
@@ -178,6 +179,7 @@ def build_sokoban_envs(
|
|
| 178 |
env_num=1,
|
| 179 |
group_n=1,
|
| 180 |
mode='rgb_array',
|
|
|
|
| 181 |
is_train=True,
|
| 182 |
env_kwargs=None):
|
| 183 |
-
return SokobanMultiProcessEnv(seed, env_num, group_n, mode, is_train, env_kwargs=env_kwargs)
|
|
|
|
| 18 |
from agent_system.environments.env_package.sokoban.sokoban import SokobanEnv
|
| 19 |
import numpy as np
|
| 20 |
|
|
|
|
| 21 |
class SokobanWorker:
|
| 22 |
"""
|
| 23 |
Ray remote actor that replaces the worker function.
|
|
|
|
| 56 |
env_num=1,
|
| 57 |
group_n=1,
|
| 58 |
mode='rgb_array',
|
| 59 |
+
resources_per_worker={"num_cpus": 0.1},
|
| 60 |
is_train=True,
|
| 61 |
env_kwargs=None):
|
| 62 |
"""
|
|
|
|
| 82 |
env_kwargs = {}
|
| 83 |
|
| 84 |
# Create Ray remote actors instead of processes
|
| 85 |
+
env_worker = ray.remote(**resources_per_worker)(SokobanWorker)
|
| 86 |
self.workers = []
|
| 87 |
for i in range(self.num_processes):
|
| 88 |
+
worker = env_worker.remote(self.mode, env_kwargs)
|
| 89 |
self.workers.append(worker)
|
| 90 |
|
| 91 |
def step(self, actions):
|
|
|
|
| 179 |
env_num=1,
|
| 180 |
group_n=1,
|
| 181 |
mode='rgb_array',
|
| 182 |
+
resources_per_worker={"num_cpus": 0.1},
|
| 183 |
is_train=True,
|
| 184 |
env_kwargs=None):
|
| 185 |
+
return SokobanMultiProcessEnv(seed, env_num, group_n, mode, resources_per_worker, is_train, env_kwargs=env_kwargs)
|
agent_system/environments/env_package/webshop/envs.py
CHANGED
|
@@ -21,7 +21,6 @@ import numpy as np
|
|
| 21 |
# Ray remote worker actor -----------------------------------------------------
|
| 22 |
# -----------------------------------------------------------------------------
|
| 23 |
|
| 24 |
-
@ray.remote(num_cpus=0.2)
|
| 25 |
class WebshopWorker:
|
| 26 |
"""Ray remote actor that replaces the worker function.
|
| 27 |
Each actor hosts a *WebAgentTextEnv* instance.
|
|
@@ -94,9 +93,10 @@ class WebshopMultiProcessEnv(gym.Env):
|
|
| 94 |
"""
|
| 95 |
def __init__(
|
| 96 |
self,
|
| 97 |
-
seed: int
|
| 98 |
-
env_num: int
|
| 99 |
-
group_n: int
|
|
|
|
| 100 |
is_train: bool = True,
|
| 101 |
env_kwargs: dict = None,
|
| 102 |
) -> None:
|
|
@@ -117,10 +117,10 @@ class WebshopMultiProcessEnv(gym.Env):
|
|
| 117 |
self._env_kwargs = env_kwargs if env_kwargs is not None else {'observation_mode': 'text', 'num_products': None}
|
| 118 |
|
| 119 |
# -------------------------- Ray actors setup --------------------------
|
|
|
|
| 120 |
self._workers = []
|
| 121 |
-
|
| 122 |
for i in range(self.num_processes):
|
| 123 |
-
worker =
|
| 124 |
self._workers.append(worker)
|
| 125 |
|
| 126 |
# Get goals from the first worker
|
|
@@ -239,9 +239,10 @@ class WebshopMultiProcessEnv(gym.Env):
|
|
| 239 |
# -----------------------------------------------------------------------------
|
| 240 |
|
| 241 |
def build_webshop_envs(
|
| 242 |
-
seed: int
|
| 243 |
-
env_num: int
|
| 244 |
-
group_n: int
|
|
|
|
| 245 |
is_train: bool = True,
|
| 246 |
env_kwargs: dict = None,
|
| 247 |
):
|
|
@@ -250,6 +251,7 @@ def build_webshop_envs(
|
|
| 250 |
seed=seed,
|
| 251 |
env_num=env_num,
|
| 252 |
group_n=group_n,
|
|
|
|
| 253 |
is_train=is_train,
|
| 254 |
env_kwargs=env_kwargs,
|
| 255 |
)
|
|
|
|
| 21 |
# Ray remote worker actor -----------------------------------------------------
|
| 22 |
# -----------------------------------------------------------------------------
|
| 23 |
|
|
|
|
| 24 |
class WebshopWorker:
|
| 25 |
"""Ray remote actor that replaces the worker function.
|
| 26 |
Each actor hosts a *WebAgentTextEnv* instance.
|
|
|
|
| 93 |
"""
|
| 94 |
def __init__(
|
| 95 |
self,
|
| 96 |
+
seed: int,
|
| 97 |
+
env_num: int,
|
| 98 |
+
group_n: int,
|
| 99 |
+
resources_per_worker: dict,
|
| 100 |
is_train: bool = True,
|
| 101 |
env_kwargs: dict = None,
|
| 102 |
) -> None:
|
|
|
|
| 117 |
self._env_kwargs = env_kwargs if env_kwargs is not None else {'observation_mode': 'text', 'num_products': None}
|
| 118 |
|
| 119 |
# -------------------------- Ray actors setup --------------------------
|
| 120 |
+
env_worker = ray.remote(**resources_per_worker)(WebshopWorker)
|
| 121 |
self._workers = []
|
|
|
|
| 122 |
for i in range(self.num_processes):
|
| 123 |
+
worker = env_worker.remote(seed + (i // self.group_n), self._env_kwargs)
|
| 124 |
self._workers.append(worker)
|
| 125 |
|
| 126 |
# Get goals from the first worker
|
|
|
|
| 239 |
# -----------------------------------------------------------------------------
|
| 240 |
|
| 241 |
def build_webshop_envs(
|
| 242 |
+
seed: int,
|
| 243 |
+
env_num: int,
|
| 244 |
+
group_n: int,
|
| 245 |
+
resources_per_worker: dict,
|
| 246 |
is_train: bool = True,
|
| 247 |
env_kwargs: dict = None,
|
| 248 |
):
|
|
|
|
| 251 |
seed=seed,
|
| 252 |
env_num=env_num,
|
| 253 |
group_n=group_n,
|
| 254 |
+
resources_per_worker=resources_per_worker,
|
| 255 |
is_train=is_train,
|
| 256 |
env_kwargs=env_kwargs,
|
| 257 |
)
|
examples/dapo_trainer/run_alfworld.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -60,6 +62,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 60 |
env.seed=0 \
|
| 61 |
env.max_steps=50 \
|
| 62 |
env.rollout.n=${group_size} \
|
|
|
|
| 63 |
trainer.critic_warmup=0 \
|
| 64 |
trainer.logger=['console','wandb'] \
|
| 65 |
trainer.project_name='verl_agent_alfworld' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 62 |
env.seed=0 \
|
| 63 |
env.max_steps=50 \
|
| 64 |
env.rollout.n=${group_size} \
|
| 65 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 66 |
trainer.critic_warmup=0 \
|
| 67 |
trainer.logger=['console','wandb'] \
|
| 68 |
trainer.project_name='verl_agent_alfworld' \
|
examples/dapo_trainer/run_webshop.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -60,6 +62,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 60 |
env.seed=0 \
|
| 61 |
env.max_steps=15 \
|
| 62 |
env.rollout.n=${group_size} \
|
|
|
|
| 63 |
trainer.critic_warmup=0 \
|
| 64 |
trainer.logger=['console','wandb'] \
|
| 65 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 62 |
env.seed=0 \
|
| 63 |
env.max_steps=15 \
|
| 64 |
env.rollout.n=${group_size} \
|
| 65 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 66 |
trainer.critic_warmup=0 \
|
| 67 |
trainer.logger=['console','wandb'] \
|
| 68 |
trainer.project_name='verl_agent_webshop' \
|
examples/gigpo_dynamic_trainer/run_alfworld.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -64,6 +66,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 64 |
env.seed=0 \
|
| 65 |
env.max_steps=50 \
|
| 66 |
env.rollout.n=$group_size \
|
|
|
|
| 67 |
trainer.critic_warmup=0 \
|
| 68 |
trainer.logger=['console','wandb'] \
|
| 69 |
trainer.project_name='verl_agent_alfworld' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 66 |
env.seed=0 \
|
| 67 |
env.max_steps=50 \
|
| 68 |
env.rollout.n=$group_size \
|
| 69 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 70 |
trainer.critic_warmup=0 \
|
| 71 |
trainer.logger=['console','wandb'] \
|
| 72 |
trainer.project_name='verl_agent_alfworld' \
|
examples/gigpo_dynamic_trainer/run_sokoban.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=32
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -66,6 +68,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 66 |
env.max_steps=15 \
|
| 67 |
env.rollout.n=$group_size \
|
| 68 |
env.sokoban.mode='rgb_array' \
|
|
|
|
| 69 |
trainer.critic_warmup=0 \
|
| 70 |
trainer.logger=['console','wandb'] \
|
| 71 |
trainer.project_name='verl_agent_sokoban' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=32
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 68 |
env.max_steps=15 \
|
| 69 |
env.rollout.n=$group_size \
|
| 70 |
env.sokoban.mode='rgb_array' \
|
| 71 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 72 |
trainer.critic_warmup=0 \
|
| 73 |
trainer.logger=['console','wandb'] \
|
| 74 |
trainer.project_name='verl_agent_sokoban' \
|
examples/gigpo_dynamic_trainer/run_webshop.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -64,6 +66,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 64 |
env.seed=0 \
|
| 65 |
env.max_steps=15 \
|
| 66 |
env.rollout.n=$group_size \
|
|
|
|
| 67 |
trainer.critic_warmup=0 \
|
| 68 |
trainer.logger=['console','wandb'] \
|
| 69 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 66 |
env.seed=0 \
|
| 67 |
env.max_steps=15 \
|
| 68 |
env.rollout.n=$group_size \
|
| 69 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 70 |
trainer.critic_warmup=0 \
|
| 71 |
trainer.logger=['console','wandb'] \
|
| 72 |
trainer.project_name='verl_agent_webshop' \
|
examples/gigpo_trainer/run_alfworld.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=50 \
|
| 58 |
env.rollout.n=$group_size \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_alfworld' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 58 |
env.seed=0 \
|
| 59 |
env.max_steps=50 \
|
| 60 |
env.rollout.n=$group_size \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_alfworld' \
|
examples/gigpo_trainer/run_alfworld_lora.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=50 \
|
| 58 |
env.rollout.n=$group_size \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_alfworld' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resource allocated for each environment worker. If you want to use less CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 58 |
env.seed=0 \
|
| 59 |
env.max_steps=50 \
|
| 60 |
env.rollout.n=$group_size \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_alfworld' \
|
examples/gigpo_trainer/run_blackjack.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=32
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=15 \
|
| 58 |
env.rollout.n=$group_size \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_blackjack' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=32
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 58 |
env.seed=0 \
|
| 59 |
env.max_steps=15 \
|
| 60 |
env.rollout.n=$group_size \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_blackjack' \
|
examples/gigpo_trainer/run_ezpoints.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=8 \
|
| 58 |
env.rollout.n=${group_size} \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_ezpoints' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 58 |
env.seed=0 \
|
| 59 |
env.max_steps=8 \
|
| 60 |
env.rollout.n=${group_size} \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_ezpoints' \
|
examples/gigpo_trainer/run_numberline.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=10 \
|
| 58 |
env.rollout.n=$group_size \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_numberLine' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 58 |
env.seed=0 \
|
| 59 |
env.max_steps=10 \
|
| 60 |
env.rollout.n=$group_size \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_numberLine' \
|
examples/gigpo_trainer/run_sokoban.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=32
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -57,6 +59,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 57 |
env.max_steps=15 \
|
| 58 |
env.rollout.n=$group_size \
|
| 59 |
env.sokoban.mode='rgb_array' \
|
|
|
|
| 60 |
trainer.critic_warmup=0 \
|
| 61 |
trainer.logger=['console','wandb'] \
|
| 62 |
trainer.project_name='verl_agent_sokoban' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=32
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 59 |
env.max_steps=15 \
|
| 60 |
env.rollout.n=$group_size \
|
| 61 |
env.sokoban.mode='rgb_array' \
|
| 62 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 63 |
trainer.critic_warmup=0 \
|
| 64 |
trainer.logger=['console','wandb'] \
|
| 65 |
trainer.project_name='verl_agent_sokoban' \
|
examples/gigpo_trainer/run_webshop.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=15 \
|
| 58 |
env.rollout.n=$group_size \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 58 |
env.seed=0 \
|
| 59 |
env.max_steps=15 \
|
| 60 |
env.rollout.n=$group_size \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_webshop' \
|
examples/gigpo_trainer/run_webshop_lora.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -57,6 +59,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 57 |
env.seed=0 \
|
| 58 |
env.max_steps=15 \
|
| 59 |
env.rollout.n=$group_size \
|
|
|
|
| 60 |
trainer.critic_warmup=0 \
|
| 61 |
trainer.logger=['console','wandb'] \
|
| 62 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 59 |
env.seed=0 \
|
| 60 |
env.max_steps=15 \
|
| 61 |
env.rollout.n=$group_size \
|
| 62 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 63 |
trainer.critic_warmup=0 \
|
| 64 |
trainer.logger=['console','wandb'] \
|
| 65 |
trainer.project_name='verl_agent_webshop' \
|
examples/gigpo_trainer/run_webshop_qwen3.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -54,6 +56,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 54 |
env.seed=0 \
|
| 55 |
env.max_steps=15 \
|
| 56 |
env.rollout.n=$group_size \
|
|
|
|
| 57 |
trainer.critic_warmup=0 \
|
| 58 |
trainer.logger=['console','wandb'] \
|
| 59 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 56 |
env.seed=0 \
|
| 57 |
env.max_steps=15 \
|
| 58 |
env.rollout.n=$group_size \
|
| 59 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 60 |
trainer.critic_warmup=0 \
|
| 61 |
trainer.logger=['console','wandb'] \
|
| 62 |
trainer.project_name='verl_agent_webshop' \
|
examples/grpo_trainer/run_alfworld.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -52,6 +54,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 52 |
env.seed=0 \
|
| 53 |
env.max_steps=50 \
|
| 54 |
env.rollout.n=$group_size \
|
|
|
|
| 55 |
trainer.critic_warmup=0 \
|
| 56 |
trainer.logger=['console','wandb'] \
|
| 57 |
trainer.project_name='verl_agent_alfworld' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 54 |
env.seed=0 \
|
| 55 |
env.max_steps=50 \
|
| 56 |
env.rollout.n=$group_size \
|
| 57 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 58 |
trainer.critic_warmup=0 \
|
| 59 |
trainer.logger=['console','wandb'] \
|
| 60 |
trainer.project_name='verl_agent_alfworld' \
|
examples/grpo_trainer/run_balckjack.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=32
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -52,6 +54,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 52 |
env.seed=0 \
|
| 53 |
env.max_steps=15 \
|
| 54 |
env.rollout.n=$group_size \
|
|
|
|
| 55 |
trainer.critic_warmup=0 \
|
| 56 |
trainer.logger=['console','wandb'] \
|
| 57 |
trainer.project_name='verl_agent_blackjack' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=32
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 54 |
env.seed=0 \
|
| 55 |
env.max_steps=15 \
|
| 56 |
env.rollout.n=$group_size \
|
| 57 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 58 |
trainer.critic_warmup=0 \
|
| 59 |
trainer.logger=['console','wandb'] \
|
| 60 |
trainer.project_name='verl_agent_blackjack' \
|
examples/grpo_trainer/run_sokoban.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=32
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -53,6 +55,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 53 |
env.max_steps=15 \
|
| 54 |
env.rollout.n=$group_size \
|
| 55 |
env.sokoban.mode='rgb_array' \
|
|
|
|
| 56 |
trainer.critic_warmup=0 \
|
| 57 |
trainer.logger=['console','wandb'] \
|
| 58 |
trainer.project_name='verl_agent_sokoban' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=32
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 55 |
env.max_steps=15 \
|
| 56 |
env.rollout.n=$group_size \
|
| 57 |
env.sokoban.mode='rgb_array' \
|
| 58 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_sokoban' \
|
examples/grpo_trainer/run_webshop.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=16
|
| 6 |
val_data_size=128
|
| 7 |
group_size=8
|
|
@@ -52,6 +54,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 52 |
env.seed=0 \
|
| 53 |
env.max_steps=15 \
|
| 54 |
env.rollout.n=$group_size \
|
|
|
|
| 55 |
trainer.critic_warmup=0 \
|
| 56 |
trainer.logger=['console','wandb'] \
|
| 57 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=16
|
| 8 |
val_data_size=128
|
| 9 |
group_size=8
|
|
|
|
| 54 |
env.seed=0 \
|
| 55 |
env.max_steps=15 \
|
| 56 |
env.rollout.n=$group_size \
|
| 57 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 58 |
trainer.critic_warmup=0 \
|
| 59 |
trainer.logger=['console','wandb'] \
|
| 60 |
trainer.project_name='verl_agent_webshop' \
|
examples/ppo_trainer/run_alfworld.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=128 # match GRPO and GiGPO configuration (16 × 8)
|
| 6 |
val_data_size=128
|
| 7 |
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.env_name=alfworld/AlfredTWEnv \
|
| 57 |
env.seed=0 \
|
| 58 |
env.max_steps=50 \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_alfworld' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=128 # match GRPO and GiGPO configuration (16 × 8)
|
| 8 |
val_data_size=128
|
| 9 |
|
|
|
|
| 58 |
env.env_name=alfworld/AlfredTWEnv \
|
| 59 |
env.seed=0 \
|
| 60 |
env.max_steps=50 \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_alfworld' \
|
examples/ppo_trainer/run_webshop.sh
CHANGED
|
@@ -2,6 +2,8 @@ set -x
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
|
|
|
|
|
|
| 5 |
train_data_size=128 # match GRPO and GiGPO configuration (16 × 8)
|
| 6 |
val_data_size=128
|
| 7 |
|
|
@@ -56,6 +58,7 @@ python3 -m verl.trainer.main_ppo \
|
|
| 56 |
env.env_name=Webshop \
|
| 57 |
env.seed=0 \
|
| 58 |
env.max_steps=15 \
|
|
|
|
| 59 |
trainer.critic_warmup=0 \
|
| 60 |
trainer.logger=['console','wandb'] \
|
| 61 |
trainer.project_name='verl_agent_webshop' \
|
|
|
|
| 2 |
ENGINE=${1:-vllm}
|
| 3 |
export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 4 |
|
| 5 |
+
num_cpus_per_env_worker=0.1 # The CPU resources allocated to each environment worker. If you want to use fewer CPU resources, you can decrease this value.
|
| 6 |
+
|
| 7 |
train_data_size=128 # match GRPO and GiGPO configuration (16 × 8)
|
| 8 |
val_data_size=128
|
| 9 |
|
|
|
|
| 58 |
env.env_name=Webshop \
|
| 59 |
env.seed=0 \
|
| 60 |
env.max_steps=15 \
|
| 61 |
+
env.resources_per_worker.num_cpus=$num_cpus_per_env_worker \
|
| 62 |
trainer.critic_warmup=0 \
|
| 63 |
trainer.logger=['console','wandb'] \
|
| 64 |
trainer.project_name='verl_agent_webshop' \
|
verl/trainer/config/ppo_trainer.yaml
CHANGED
|
@@ -288,6 +288,9 @@ env:
|
|
| 288 |
seed: 0
|
| 289 |
max_steps: 50
|
| 290 |
history_length: 2
|
|
|
|
|
|
|
|
|
|
| 291 |
rollout:
|
| 292 |
n: -1 # the group number of envs (for GRPO and GiGPO); -1 disables env grouping.
|
| 293 |
|
|
|
|
| 288 |
seed: 0
|
| 289 |
max_steps: 50
|
| 290 |
history_length: 2
|
| 291 |
+
resources_per_worker: # resources for each env worker
|
| 292 |
+
num_cpus: 0.1
|
| 293 |
+
num_gpus: 0
|
| 294 |
rollout:
|
| 295 |
n: -1 # the group number of envs (for GRPO and GiGPO); -1 disables env grouping.
|
| 296 |
|