major redesign of observations and entities

Steffen Illium
2023-06-09 14:04:17 +02:00
parent 901fbcbc32
commit c552c35f66
161 changed files with 4458 additions and 4163 deletions


@@ -34,9 +34,9 @@ import seaborn as sns
import multiprocessing as mp
"""
In this studie, we want to explore the macro behaviour of multi agents which are trained on the same task,
In this studie, we want to explore the macro behaviour of multi agent which are trained on the same task,
but never saw each other in training.
Those agents learned
Those agent learned
We start with training a single policy on a single task (dirt cleanup / item pickup).
@@ -51,16 +51,16 @@ There are further distinctions to be made:
2. Observation in separate slice - [['seperate_0'], ['seperate_1'], ['seperate_N']]:
- Agents see other entities on a separate slice
- This slice has been filled with $0 | 1 | \mathcal{N}(0, 1)$
-- Depending ob the fill value, agents will react diffently
-- Depending ob the fill value, agent will react diffently
-> TODO: Test this!
3. Observation in level slice - ['in_lvl_obs']:
- This tells the agent to treat other agents as obstacle.
- This tells the agent to treat other agent as obstacle.
- However, the state space is altered since moving obstacles are not part of the original agent observation.
- We are out of distribution.
4. Observation (similar to a camera read-out) ['in_lvl_0.5', 'in_lvl_n']
- This tells the agent to treat other agents as obstacle, but "sees" them encoded as a different value.
- This tells the agent to treat other agent as obstacle, but "sees" them encoded as a different value.
- However, the state space is altered since moving obstacles are not part of the original agent observation.
- We are out of distribution.
"""
@@ -96,7 +96,7 @@ def encapsule_env_factory(env_fctry, env_kwrgs):
def load_model_run_baseline(seed_path, env_to_run):
# retrieve model class
model_cls = next(val for key, val in h.MODEL_MAP.items() if key in seed_path.parent.name)
# Load both agents
# Load both agent
model = model_cls.load(seed_path / 'model.zip', device='cpu')
# Load old env kwargs
with next(seed_path.glob('*.json')).open('r') as f:
@@ -124,7 +124,7 @@ def load_model_run_study(seed_path, env_to_run, additional_kwargs_dict):
global model_cls
# retrieve model class
model_cls = next(val for key, val in h.MODEL_MAP.items() if key in seed_path.parent.name)
# Load both agents
# Load both agent
models = [model_cls.load(seed_path / 'model.zip', device='cpu') for _ in range(n_agents)]
# Load old env kwargs
with next(seed_path.glob('*.json')).open('r') as f:
@@ -331,7 +331,7 @@ if __name__ == '__main__':
for obs_mode in observation_modes.keys():
for env_name in env_names:
for model_cls in [h.MODEL_MAP['A2C']]:
# Create an identifier, which is unique for every combination and easy to read in filesystem
# Create an _identifier, which is unique for every combination and easy to read in filesystem
identifier = f'{model_cls.__name__}_{start_time}'
# Train each combination per seed
combination_path = study_root_path / obs_mode / env_name / identifier
@@ -425,7 +425,7 @@ if __name__ == '__main__':
print('Start Baseline Tracking')
for obs_mode in observation_modes:
obs_mode_path = next(x for x in study_root_path.iterdir() if x.is_dir() and x.name == obs_mode)
# For trained policy in study_root_path / identifier
# For trained policy in study_root_path / _identifier
for env_path in [x for x in obs_mode_path.iterdir() if x.is_dir()]:
for policy_path in [x for x in env_path.iterdir() if x.is_dir()]:
# Iteration
@@ -440,7 +440,7 @@ if __name__ == '__main__':
print('Start OOD Tracking')
for obs_mode in observation_modes:
obs_mode_path = next(x for x in study_root_path.iterdir() if x.is_dir() and x.name == obs_mode)
# For trained policy in study_root_path / identifier
# For trained policy in study_root_path / _identifier
for env_path in [x for x in obs_mode_path.iterdir() if x.is_dir()]:
for policy_path in [x for x in env_path.iterdir() if x.is_dir()]:
# FIXME: Pick random seed or iterate over available seeds

studies/e_1_mix.py (Normal file, 203 lines)

@@ -0,0 +1,203 @@
import itertools
import sys
from pathlib import Path
##############################################
# keep this for stand alone script execution #
##############################################
import numpy as np
try:
# noinspection PyUnboundLocalVariable
if __package__ is None:
DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(DIR.parent))
__package__ = DIR.name
else:
DIR = None
except NameError:
DIR = None
pass
##############################################
##############################################
##############################################
import simplejson
from stable_baselines3.common.vec_env import SubprocVecEnv
from environments import helpers as h
from environments.factory.factory_dirt import DirtFactory
from environments.factory.dirt_util import DirtProperties
from environments.factory.combined_factories import DirtItemFactory
from environments.factory.factory_item import ItemFactory
from environments.factory.additional.item.item_util import ItemProperties
from environments.logging.envmonitor import MonitorCallback
from environments.utility_classes import MovementProperties
from plotting.compare_runs import compare_seed_runs, compare_model_runs, compare_all_parameter_runs
# Define a global study save path
start_time = 1631709932 # int(time.time())
study_root_path = (Path('..') if not DIR else Path()) / 'study_out' / f'e_1_{start_time}'
"""
In this study, we want to explore the macro behaviour of multiple agents which are trained on different tasks
in the same environment, but never saw each other in training.
Those agents learned to maximize their individual environmental goal in {dirt cleanup, item pickup, mixed}.
We start with agents that have been trained on a single task (dirt cleanup / item pickup / mixed).
We then mix two agents with any of the trained policies.
There are further distinctions to be made:
1. No Observation - ['no_obs']:
- Agents do not see each other, only the consequences of their combined actions
- Agents can collide
2. Observation in separate slice - [['seperate_0'], ['seperate_1'], ['seperate_N']]:
- Agents see other entities on a separate slice
- This slice has been filled with $0 | 1 | \mathcal{N}(0, 1)$
-- Depending on the fill value, agents will react differently
-> TODO: Test this!
3. Observation in level slice - ['in_lvl_obs']:
- This tells the agent to treat other agents as obstacles.
- However, the state space is altered since moving obstacles are not part of the original agent observation.
- We are out of distribution.
"""
def policy_model_kwargs():
return dict(ent_coef=0.01)
def dqn_model_kwargs():
return dict(buffer_size=50000,
learning_starts=64,
batch_size=64,
target_update_interval=5000,
exploration_fraction=0.25,
exploration_final_eps=0.025
)
def encapsule_env_factory(env_fctry, env_kwrgs):
def _init():
with env_fctry(**env_kwrgs) as init_env:
return init_env
return _init
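Illustrative sketch (not part of this commit): how the helpers above are typically combined. The closure returned by encapsule_env_factory is picklable, which SubprocVecEnv needs when it spawns worker processes, and the two kwargs helpers map onto stable_baselines3 constructor arguments. build_vec_env and build_model are hypothetical names.

from stable_baselines3 import A2C, DQN
from stable_baselines3.common.vec_env import SubprocVecEnv

def build_vec_env(env_cls, env_kwargs, n_workers=6):
    # One deferred env constructor per worker process.
    return SubprocVecEnv([encapsule_env_factory(env_cls, env_kwargs)
                          for _ in range(n_workers)], start_method='spawn')

def build_model(algorithm, vec_env):
    # Route the kwargs helpers defined above into the matching SB3 constructor.
    if algorithm == 'A2C':
        return A2C('MlpPolicy', vec_env, **policy_model_kwargs(), verbose=1, device='cpu')
    if algorithm == 'DQN':
        return DQN('MlpPolicy', vec_env, **dqn_model_kwargs(), verbose=1, device='cpu')
    raise ValueError(f'Unknown algorithm: {algorithm}')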
if __name__ == '__main__':
# Define Global Env Parameters
# Define properties object parameters
move_props = MovementProperties(allow_diagonal_movement=True,
allow_square_movement=True,
allow_no_op=False)
dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
max_local_amount=1, spawn_frequency=15, max_spawn_ratio=0.05,
dirt_smear_amount=0.0, agent_can_interact=True)
item_props = ItemProperties(n_items=10, agent_can_interact=True,
spawn_frequency=30, n_drop_off_locations=2,
max_agent_inventory_capacity=15)
factory_kwargs = dict(n_agents=1,
pomdp_r=2, max_steps=400, parse_doors=False,
level_name='rooms', frames_to_stack=3,
omit_agent_in_obs=True, combin_agent_obs=True, record_episodes=False,
cast_shadows=True, doors_have_area=False, verbose=False,
movement_properties=move_props
)
# Bundle both environments with global kwargs and parameters
env_map = {'dirt': (DirtFactory, dict(dirt_properties=dirt_props, **factory_kwargs)),
'item': (ItemFactory, dict(item_properties=item_props, **factory_kwargs)),
'itemdirt': (DirtItemFactory, dict(dirt_properties=dirt_props, item_properties=item_props,
**factory_kwargs))}
env_names = list(env_map.keys())
# Define parameter versions according to #1, 2[1,0,N], 3
observation_modes = {
# Fill-value = 0
'seperate_0': dict(additional_env_kwargs=dict(additional_agent_placeholder=0),
post_training_env_kwargs=dict(omit_agent_in_obs=True,
combin_agent_obs=False)
),
# Fill-value = 1
'seperate_1': dict(additional_env_kwargs=dict(additional_agent_placeholder=1),
post_training_env_kwargs=dict(omit_agent_in_obs=True,
combin_agent_obs=False)
),
# Fill-value = N(0, 1)
'seperate_N': dict(additional_env_kwargs=dict(additional_agent_placeholder='N'),
post_training_env_kwargs=dict(omit_agent_in_obs=True,
combin_agent_obs=False)
),
# Further adjustments are done post-training
'in_lvl_obs': dict(post_training_kwargs=dict(other_agent_obs='in_lvl'),
),
# No further adjustment needed
'no_obs': None
}
# Evaluation starts here #####################################################
# Iterate Observation Modes
for observation_mode in observation_modes:
obs_mode_path = next(x for x in study_root_path.iterdir() if x.is_dir() and x.name == observation_mode)
# For trained policy in study_root_path / _identifier
for env_paths in itertools.combinations([x for x in obs_mode_path.iterdir() if x.is_dir()], 2):
policy_path_zip = zip(*[[x for x in env_paths[i].iterdir() if x.is_dir()] for i in range(len(env_paths))])
for policy_paths in policy_path_zip:
# TODO: Pick random seed or iterate over available seeds
# First seed path version
# policy_path = next((y for y in policy_path.iterdir() if y.is_dir()))
# Iteration
seed_path_zip = zip(*[[y for y in policy_paths[i].iterdir() if y.is_dir()] for i in range(len(policy_paths))])
for seed_paths in seed_path_zip:
# retrieve model class
for model_cls in (val for key, val in h.MODEL_MAP.items() if key in policy_paths[0].name):
# Load both agents
models = [model_cls.load(seed_paths[i] / 'model.zip') for i in range(len(seed_paths))]
# Load old env kwargs
with next(seed_paths[0].glob('*.json')).open('r') as f:
env_kwargs = simplejson.load(f)
# Update kwargs to account for multiple agents etc.
env_kwargs.update(n_agents=len(models), additional_agent_placeholder=None,
**observation_modes[observation_mode].get('post_training_env_kwargs', {}))
# EnvMonitor Init
comb = f'combination_{"_".join([env_paths[i].name for i in range(len(env_paths))])}'
comb_monitor_path = obs_mode_path / comb / 'e_1_mix_monitor.pick'
comb_monitor_path.parent.mkdir(parents=True, exist_ok=True)
with MonitorCallback(filepath=comb_monitor_path) as monitor:
# Init Env
env = env_map['itemdirt'][0](**env_kwargs)
# Evaluation loop over n episodes
for episode in range(50):
obs = env.reset()
rew, done_bool = 0, False
while not done_bool:
actions = []
for i, model in enumerate(models):
if ptk := observation_modes[observation_mode].get('post_training_kwargs', {}):
if ptk.get('other_agent_obs', '') == 'in_lvl':
# Merge the other-agent slice into the level slice so foreign agents appear as obstacles
a_obs = np.concatenate(
((obs[i][0] + (obs[i][1] == 1).astype(np.float32))[None, ...],
obs[i][2:])
)
else:
a_obs = obs[i]
else:
a_obs = obs[i]
actions.append(model.predict(a_obs, deterministic=False)[0])  # use the (possibly re-encoded) observation
env_state, step_r, done_bool, info_obj = env.step(actions)
monitor._read_info(0, info_obj)
rew += step_r
if done_bool:
monitor._read_done(0, done_bool)
break
print(f'Factory run {episode} done, reward is:\n {rew}')
# Eval monitor outputs are automatically stored by the monitor object
# TODO: Plotting
pass
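Illustrative sketch (not part of this commit): the np.concatenate expression in the loop above folds the other-agent slice into the level slice, so foreign agents read as obstacles (the 'in_lvl' modes from the docstring). A standalone version of that transformation, assuming a (slices, height, width) observation layout; the function name and the encode_value parameter are hypothetical.

import numpy as np

def merge_other_agents_into_level(obs, encode_value=1.0):
    # obs[0]: level slice, obs[1]: other-agent slice, obs[2:]: remaining slices.
    other_agent_mask = (obs[1] == 1).astype(np.float32)
    level_with_agents = obs[0] + other_agent_mask * encode_value
    return np.concatenate((level_with_agents[None, ...], obs[2:]), axis=0)

An encode_value of 0.5 would correspond to the 'in_lvl_0.5' variant, where other agents appear in the level slice at a value distinct from walls.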


@@ -47,7 +47,7 @@ def encapsule_env_factory(env_fctry, env_kwrgs):
def load_model_run_baseline(policy_path, env_to_run):
# retrieve model class
model_cls = h.MODEL_MAP['A2C']
# Load both agents
# Load both agent
model = model_cls.load(policy_path / 'model.zip', device='cpu')
# Load old env kwargs
with next(policy_path.glob('*params.json')).open('r') as f:
@@ -76,7 +76,7 @@ def load_model_run_baseline(policy_path, env_to_run):
def load_model_run_combined(root_path, env_to_run, env_kwargs):
# retrieve model class
model_cls = h.MODEL_MAP['A2C']
# Load both agents
# Load both agent
models = [model_cls.load(model_zip, device='cpu') for model_zip in root_path.rglob('model.zip')]
# Load old env kwargs
env_kwargs = env_kwargs.copy()
@@ -252,7 +252,7 @@ if __name__ == '__main__':
if individual_run:
print('Start Individual Recording')
for env_key in (env_key for env_key in env_map if 'combined' != env_key):
# For trained policy in study_root_path / identifier
# For trained policy in study_root_path / _identifier
policy_path = study_root_path / env_key
load_model_run_baseline(policy_path, env_map[policy_path.name][0])
@@ -264,7 +264,7 @@ if __name__ == '__main__':
if combined_run:
print('Start combined run')
for env_key in (env_key for env_key in env_map if 'combined' == env_key):
# For trained policy in study_root_path / identifier
# For trained policy in study_root_path / _identifier
factory, kwargs = env_map[env_key]
load_model_run_combined(study_root_path, factory, kwargs)
print('OOD Tracking Done')

studies/test.py (Normal file, 187 lines)

@@ -0,0 +1,187 @@
import sys
from pathlib import Path
from stable_baselines3.common.vec_env import SubprocVecEnv
try:
# noinspection PyUnboundLocalVariable
if __package__ is None:
DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(DIR.parent))
__package__ = DIR.name
else:
DIR = None
except NameError:
DIR = None
pass
import simplejson
from environments.logging.recorder import EnvRecorder
from environments import helpers as h
from environments.factory.factory_dirt import DirtFactory
from environments.factory.dirt_util import DirtProperties
from environments.factory.factory_item import ItemFactory
from environments.factory.additional.item.item_util import ItemProperties
from environments.logging.envmonitor import EnvMonitor
from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions
"""
In this study, we want to export trained agents for debugging purposes.
"""
def encapsule_env_factory(env_fctry, env_kwrgs):
def _init():
with env_fctry(**env_kwrgs) as init_env:
return init_env
return _init
def load_model_run_baseline(policy_path, env_to_run):
# retrieve model class
model_cls = h.MODEL_MAP['A2C']
# Load the trained agent
model = model_cls.load(policy_path / 'model.zip', device='cpu')
# Load old env kwargs
with next(policy_path.glob('*params.json')).open('r') as f:
env_kwargs = simplejson.load(f)
env_kwargs.update(done_at_collision=True)
# Init Env
with env_to_run(**env_kwargs) as env_factory:
monitored_env_factory = EnvMonitor(env_factory)
recorded_env_factory = EnvRecorder(monitored_env_factory)
# Evaluation loop over n episodes
for episode in range(5):
env_state = recorded_env_factory.reset()
rew, done_bool = 0, False
while not done_bool:
action = model.predict(env_state, deterministic=True)[0]
env_state, step_r, done_bool, info_obj = recorded_env_factory.step(action)
rew += step_r
if done_bool:
break
print(f'Factory run {episode} done, reward is:\n {rew}')
recorded_env_factory.save_run(filepath=policy_path / f'monitor.pick')
recorded_env_factory.save_records(filepath=policy_path / f'recorder.json')
if __name__ == '__main__':
# What to do:
train = True
individual_run = True
combined_run = False
multi_env = False
train_steps = 2e6
frames_to_stack = 3
# Define a global study save path
study_root_path = Path(__file__).parent.parent / 'study_out' / f'{Path(__file__).stem}'
def policy_model_kwargs():
return dict()
# Define Global Env Parameters
# Define properties object parameters
obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,
additional_agent_placeholder=None,
omit_agent_self=True,
frames_to_stack=frames_to_stack,
pomdp_r=2, cast_shadows=True)
move_props = MovementProperties(allow_diagonal_movement=True,
allow_square_movement=True,
allow_no_op=False)
dirt_props = DirtProperties(initial_dirt_ratio=0.35, initial_dirt_spawn_r_var=0.1,
clean_amount=0.34,
max_spawn_amount=0.1, max_global_amount=20,
max_local_amount=1, spawn_frequency=0, max_spawn_ratio=0.05,
dirt_smear_amount=0.0, agent_can_interact=True)
item_props = ItemProperties(n_items=10, spawn_frequency=30, n_drop_off_locations=2,
max_agent_inventory_capacity=15)
factory_kwargs = dict(n_agents=1, max_steps=500, parse_doors=True,
level_name='rooms', doors_have_area=True,
verbose=False,
mv_prop=move_props,
obs_prop=obs_props,
done_at_collision=False
)
# Bundle both environments with global kwargs and parameters
env_map = {}
env_map.update({'dirt': (DirtFactory, dict(dirt_prop=dirt_props,
**factory_kwargs.copy()))})
env_map.update({'item': (ItemFactory, dict(item_prop=item_props,
**factory_kwargs.copy()))})
# env_map.update({'dest': (DestFactory, dict(dest_prop=dest_props,
# **factory_kwargs.copy()))})
env_names = list(env_map.keys())
# Train starts here ############################################################
# Build Major Loop parameters, parameter versions, Env Classes and models
if train:
for env_key in (env_key for env_key in env_map if 'combined' != env_key):
model_cls = h.MODEL_MAP['A2C']
combination_path = study_root_path / env_key
env_class, env_kwargs = env_map[env_key]
# Output folder
if (combination_path / 'monitor.pick').exists():
continue
combination_path.mkdir(parents=True, exist_ok=True)
if not multi_env:
env_factory = encapsule_env_factory(env_class, env_kwargs)()
else:
env_factory = SubprocVecEnv([encapsule_env_factory(env_class, env_kwargs)
for _ in range(6)], start_method="spawn")
param_path = combination_path / f'env_params.json'
try:
env_factory.env_method('save_params', param_path)
except AttributeError:
env_factory.save_params(param_path)
# EnvMonitor Init
callbacks = [EnvMonitor(env_factory)]
# Model Init
model = model_cls("MlpPolicy", env_factory, **policy_model_kwargs(),
verbose=1, seed=69, device='cpu')
# Model train
model.learn(total_timesteps=int(train_steps), callback=callbacks)
# Model save
save_path = combination_path / f'model.zip'
model.save(save_path)
# Monitor Save
callbacks[0].save_run(combination_path / 'monitor.pick')
# Better safe than sorry: clean up!
del env_factory, model
import gc
gc.collect()
# Train ends here ############################################################
# Evaluation starts here #####################################################
# First Iterate over every model and monitor "as trained"
if individual_run:
print('Start Individual Recording')
for env_key in (env_key for env_key in env_map if 'combined' != env_key):
# For trained policy in study_root_path / _identifier
policy_path = study_root_path / env_key
load_model_run_baseline(policy_path, env_map[policy_path.name][0])
# for policy_path in (y for y in policy_path.iterdir() if y.is_dir()):
# load_model_run_baseline(policy_path)
print('Done Individual Recording')
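Illustrative sketch (not part of this commit) for the 'TODO: Plotting' left open in studies/e_1_mix.py: one way to turn a saved monitor pickle into a reward curve. It assumes the .pick file deserializes into something pandas can read as per-step records with 'episode' and 'step_reward' columns; those column names are assumptions, not the monitor's documented schema.

import pickle
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_monitor_rewards(monitor_path: Path):
    # Assumption: the pickle holds a DataFrame-like table of per-step monitor records.
    with monitor_path.open('rb') as f:
        records = pickle.load(f)
    df = pd.DataFrame(records)
    per_episode = df.groupby('episode')['step_reward'].sum().reset_index()
    sns.lineplot(data=per_episode, x='episode', y='step_reward')
    plt.savefig(monitor_path.with_suffix('.png'))
    plt.close()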