From e7461d7dcf16d56be6651310c352b50657231685 Mon Sep 17 00:00:00 2001
From: Steffen Illium
Date: Tue, 3 May 2022 11:32:19 +0200
Subject: [PATCH] Adjustments and Documentation

---
 environments/factory/factory_dirt.py      |   2 +-
 quickstart/single_agent_train_dirt_env.py | 192 ++++++++++++++++++++++
 2 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 quickstart/single_agent_train_dirt_env.py

diff --git a/environments/factory/factory_dirt.py b/environments/factory/factory_dirt.py
index 3ba9779..ecd4dc9 100644
--- a/environments/factory/factory_dirt.py
+++ b/environments/factory/factory_dirt.py
@@ -33,7 +33,7 @@ class RewardsDirt(NamedTuple):
 
 class DirtProperties(NamedTuple):
     initial_dirt_ratio: float = 0.3         # On INIT, on max how many tiles does the dirt spawn in percent.
-    initial_dirt_spawn_r_var: float = 0.05   # How much does the dirt spawn amount vary?
+    initial_dirt_spawn_r_var: float = 0.05  # How much does the dirt spawn amount vary?
     clean_amount: float = 1                 # How much does the robot clean with one actions.
     max_spawn_ratio: float = 0.20           # On max how many tiles does the dirt spawn in percent.
     max_spawn_amount: float = 0.3           # How much dirt does spawn per tile at max.
diff --git a/quickstart/single_agent_train_dirt_env.py b/quickstart/single_agent_train_dirt_env.py
new file mode 100644
index 0000000..b2025d1
--- /dev/null
+++ b/quickstart/single_agent_train_dirt_env.py
@@ -0,0 +1,192 @@
+import sys
+import time
+import pickle
+from pathlib import Path
+
+import pandas as pd
+import seaborn as sns
+import simplejson
+import stable_baselines3 as sb3
+from matplotlib import pyplot as plt
+
+try:
+    # noinspection PyUnboundLocalVariable
+    if __package__ is None:
+        DIR = Path(__file__).resolve().parent
+        sys.path.insert(0, str(DIR.parent))
+        __package__ = DIR.name
+    else:
+        DIR = None
+except NameError:
+    DIR = None
+
+from environments import helpers as h
+from environments.factory.factory_dirt import DirtProperties, DirtFactory
+from environments.logging.envmonitor import EnvMonitor
+from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions
+from plotting.compare_runs import compare_seed_runs
+
+"""
+Welcome to this quick start file. Here we will see how to:
+    0. Set up I/O paths.
+    1. Set up parameters for the environment (dirt factory).
+    2. Set up parameters for the agent training (SB3: PPO), save metrics and run the training.
+    3. Save the env and the agent for later analysis.
+    4. Load the agent from disk.
+    5. Run the trained agent in the env and record its measures.
+    6. Plot the recorded metrics.
+"""
+
+if __name__ == '__main__':
+    #########################################################
+    # 0. Set up I/O paths.
+    # Define some general parameters.
+    train_steps = 1e6
+    n_seeds = 3
+    model_class = sb3.PPO
+    env_class = DirtFactory
+
+    # Define a global study save path.
+    start_time = int(time.time())
+    study_root_path = Path(__file__).parent.parent / 'study_out' / f'{Path(__file__).stem}_{start_time}'
+    # Create an identifier that is unique for every combination and easy to read in the filesystem.
+    identifier = f'{model_class.__name__}_{env_class.__name__}_{start_time}'
+    exp_path = study_root_path / identifier
+
+    #########################################################
+    # 1. Set up parameters for the environment (dirt factory).
+
+    # Define property object parameters.
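+    # Each of the following '*Properties' objects is a plain parameter container
+    # ('DirtProperties', for instance, is a NamedTuple, see the diff above);
+    # every field that is left out keeps its default value.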
+    # 'ObservationProperties' specify how the agent sees the env.
+    obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,  # Agents won't be shown in the obs at all
+                                      omit_agent_self=True,                  # This is the default
+                                      additional_agent_placeholder=None,     # We will not take care of future agents
+                                      frames_to_stack=3,                     # To give the agent a notion of time
+                                      pomdp_r=2)                             # The agent's view radius
+
+    # 'MovementProperties' specify how the agent is allowed to move in the env.
+    move_props = MovementProperties(allow_diagonal_movement=True,  # Euclidean style (vertices)
+                                    allow_square_movement=True,    # Manhattan style (edges)
+                                    allow_no_op=False)             # Pause movement (do nothing)
+
+    # 'DirtProperties' control if and how dirt is spawned.
+    dirt_props = DirtProperties(initial_dirt_ratio=0.35,       # On INIT, on max how many tiles (in percent) dirt spawns
+                                initial_dirt_spawn_r_var=0.1,  # How much the initial spawn amount varies
+                                clean_amount=0.34,             # How much dirt one clean action removes
+                                max_spawn_amount=0.1,          # How much dirt spawns per tile at max
+                                max_global_amount=20,          # Hard limit on dirt in the whole env
+                                max_local_amount=1,            # Hard limit on dirt on a single tile
+                                spawn_frequency=0,             # How often new dirt spawns
+                                max_spawn_ratio=0.05,          # On max how many tiles (in percent) new dirt spawns
+                                dirt_smear_amount=0.0)         # How much dirt agents smear around when moving
+
+    # These are the EnvKwargs for initializing the env class, holding all former parameter classes.
+    factory_kwargs = dict(n_agents=1,              # Train a single agent
+                          max_steps=400,           # Episode length
+                          parse_doors=True,        # Use the doors defined in the level
+                          level_name='rooms',      # Name of the level layout to load
+                          doors_have_area=True,    # Doors occupy their own tile
+                          verbose=False,           # Keep the console output quiet
+                          mv_prop=move_props,      # See above
+                          obs_prop=obs_props,      # See above
+                          done_at_collision=True,  # End an episode as soon as agents collide
+                          dirt_props=dirt_props)   # See above
+
+    #########################################################
+    # 2. Set up parameters for the agent training (SB3: PPO) and save metrics.
+    # Further keyword arguments for the model constructor can be collected here
+    # (e.g. learning_rate, n_steps, gamma, ...).
+    agent_kwargs = dict(verbose=1, device='cpu')
+
+    #########################################################
+    # Run the training.
+    for seed in range(n_seeds):
+        # Work on a copy if you want to alter things in the training loop, like the seed.
+        env_kwargs = factory_kwargs.copy()
+        env_kwargs.update(env_seed=seed)
+
+        # Output folder
+        seed_path = exp_path / f'{seed}_{identifier}'
+        seed_path.mkdir(parents=True, exist_ok=True)
+
+        # Parameter storage
+        param_path = seed_path / 'env_params.json'
+        # Observation (measures) storage
+        monitor_path = seed_path / 'monitor.pick'
+        # Save path for the trained model
+        model_save_path = seed_path / 'model.zip'
+
+        # Env init
+        with env_class(**env_kwargs) as env_factory:
+
+            # EnvMonitor init; it records env measures during training.
+            env_monitor_callback = EnvMonitor(env_factory)
+
+            # Model init
+            model = model_class("MlpPolicy", env_factory, seed=seed, **agent_kwargs)
+
+            # Model train
+            model.learn(total_timesteps=int(train_steps), callback=[env_monitor_callback])
+
+            #########################################################
+            # 3. Save env and agent for later analysis.
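+            # Three artifacts are written per seed: the SB3 model as a .zip,
+            # the env parameters as .json and the recorded measures as a pickle.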
+            # Save the trained model, the monitor (env measures) and the env parameters.
+            model.save(model_save_path)
+            env_factory.save_params(param_path)
+            env_monitor_callback.save_run(monitor_path)
+
+    # Compare the performance of the runs, for each seed within a model.
+    try:
+        compare_seed_runs(exp_path, use_tex=False)
+    except ValueError:
+        pass
+
+    # Training ends here ########################################################
+
+    # Evaluation starts here ####################################################
+    # Iterate over every trained model and monitor it "as trained".
+    print('Start Measurement Tracking')
+    # For each trained policy in study_root_path / identifier
+    for policy_path in [x for x in exp_path.iterdir() if x.is_dir()]:
+
+        # Retrieve the model class.
+        model_cls = next(val for key, val in h.MODEL_MAP.items() if key in policy_path.parent.name)
+        # 4. Load the agent from disk.
+        model = model_cls.load(policy_path / 'model.zip', device='cpu')
+        # Load the env kwargs the agent was trained with.
+        with next(policy_path.glob('*.json')).open('r') as f:
+            env_kwargs = simplejson.load(f)
+        # Make the env stop at collisions
+        # (you only want to have a single collision per episode, hence the statistics).
+        env_kwargs.update(done_at_collision=True)
+
+        # Env init
+        with env_class(**env_kwargs) as env_factory:
+            monitored_env_factory = EnvMonitor(env_factory)
+
+            # 5. Run the trained agent in the env and record its measures.
+            # Evaluation loop over n episodes.
+            for episode in range(100):
+                env_state = monitored_env_factory.reset()
+                rew, done_bool = 0, False
+                while not done_bool:
+                    action = model.predict(env_state, deterministic=True)[0]
+                    env_state, step_r, done_bool, info_obj = monitored_env_factory.step(action)
+                    rew += step_r
+                print(f'Factory run {episode} done, reward is:\n    {rew}')
+            monitored_env_factory.save_run(filepath=policy_path / 'baseline_monitor.pick')
+
+    print('Measurements Done')
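+
+    #########################################################
+    # 6. Plot the recorded metrics.
+    # A minimal plotting sketch. It assumes every monitor pickle unpickles into
+    # something pandas can turn into a DataFrame; adjust the loading and the
+    # column selection to the actual schema EnvMonitor writes.
+    for monitor_file in exp_path.rglob('*.pick'):
+        with monitor_file.open('rb') as f:
+            monitor_df = pd.DataFrame(pickle.load(f))
+        # One line per numeric measure, indexed by recording order.
+        sns.lineplot(data=monitor_df.select_dtypes('number'))
+        plt.title(f'Measures of {monitor_file.parent.name}')
+        plt.savefig(monitor_file.with_suffix('.png'))
+        plt.close()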