Individual Rewards

Steffen Illium 2021-11-16 12:14:11 +01:00
parent b6bda84033
commit 0fe90f3ac0
11 changed files with 130 additions and 108 deletions

View File

@@ -61,7 +61,8 @@ class BaseFactory(gym.Env):
                  mv_prop: MovementProperties = MovementProperties(),
                  obs_prop: ObservationProperties = ObservationProperties(),
                  parse_doors=False, record_episodes=False, done_at_collision=False,
-                 verbose=False, doors_have_area=True, env_seed=time.time_ns(), **kwargs):
+                 verbose=False, doors_have_area=True, env_seed=time.time_ns(), individual_rewards=False,
+                 **kwargs):
         if isinstance(mv_prop, dict):
             mv_prop = MovementProperties(**mv_prop)
@@ -94,6 +95,7 @@ class BaseFactory(gym.Env):
         self.record_episodes = record_episodes
         self.parse_doors = parse_doors
         self.doors_have_area = doors_have_area
+        self.individual_rewards = individual_rewards

         # Reset
         self.reset()
@@ -487,31 +489,32 @@ class BaseFactory(gym.Env):
     def calculate_reward(self) -> (int, dict):
         # Returns: Reward, Info
         per_agent_info_dict = defaultdict(dict)
-        reward = 0
+        reward = {}
         for agent in self[c.AGENT]:
+            per_agent_reward = 0
             if self._actions.is_moving_action(agent.temp_action):
                 if agent.temp_valid:
                     # info_dict.update(movement=1)
-                    reward -= 0.01
+                    per_agent_reward -= 0.01
                     pass
                 else:
-                    reward -= 0.05
+                    per_agent_reward -= 0.05
                     self.print(f'{agent.name} just hit the wall at {agent.pos}.')
                     per_agent_info_dict[agent.name].update({f'{agent.name}_vs_LEVEL': 1})

             elif h.EnvActions.USE_DOOR == agent.temp_action:
                 if agent.temp_valid:
-                    # reward += 0.00
+                    # per_agent_reward += 0.00
                     self.print(f'{agent.name} did just use the door at {agent.pos}.')
                     per_agent_info_dict[agent.name].update(door_used=1)
                 else:
-                    # reward -= 0.00
+                    # per_agent_reward -= 0.00
                     self.print(f'{agent.name} just tried to use a door at {agent.pos}, but failed.')
                     per_agent_info_dict[agent.name].update({f'{agent.name}_failed_door_open': 1})
             elif h.EnvActions.NOOP == agent.temp_action:
                 per_agent_info_dict[agent.name].update(no_op=1)
-                # reward -= 0.00
+                # per_agent_reward -= 0.00

             # Monitor Notes
             if agent.temp_valid:
@@ -522,7 +525,7 @@ class BaseFactory(gym.Env):
                 per_agent_info_dict[agent.name].update({f'{agent.name}_failed_action': 1})

             additional_reward, additional_info_dict = self.calculate_additional_reward(agent)
-            reward += additional_reward
+            per_agent_reward += additional_reward
             per_agent_info_dict[agent.name].update(additional_info_dict)

             if agent.temp_collisions:
@@ -531,6 +534,7 @@ class BaseFactory(gym.Env):
                 for other_agent in agent.temp_collisions:
                     per_agent_info_dict[agent.name].update({f'{agent.name}_vs_{other_agent.name}': 1})
+            reward[agent.name] = per_agent_reward

         # Combine the per_agent_info_dict:
         combined_info_dict = defaultdict(lambda: 0)
@@ -539,7 +543,13 @@ class BaseFactory(gym.Env):
                 combined_info_dict[key] += value
         combined_info_dict = dict(combined_info_dict)

-        self.print(f"reward is {reward}")
+        if self.individual_rewards:
+            self.print(f"rewards are {reward}")
+            reward = list(reward.values())
+            return reward, combined_info_dict
+        else:
+            reward = sum(reward.values())
+            self.print(f"reward is {reward}")
         return reward, combined_info_dict

     def render(self, mode='human'):
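In short, calculate_reward now fills a per-agent dict and either returns the summed team reward (the default, as before) or the list of per-agent values when individual_rewards=True. A minimal, standalone sketch of just that aggregation step, with made-up agent names and values (not taken from the repository):

    # hypothetical per-step values collected in the loop above
    per_agent_reward = {'Agent#0': -0.01, 'Agent#1': -0.05}

    individual_rewards = True
    if individual_rewards:
        # one entry per agent; dicts preserve insertion order, so this follows the agent list
        reward = list(per_agent_reward.values())   # [-0.01, -0.05]
    else:
        # previous behaviour: a single shared team reward
        reward = sum(per_agent_reward.values())    # -0.06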

View File

@@ -18,14 +18,15 @@ if __name__ == '__main__':
    model_name = 'A2C_ItsDirt'
    run_id = 0
+   determin = True
    seed = 67
    n_agents = 1
-   out_path = Path('study_out/e_1_ItsDirt/no_obs/dirt/A2C_ItsDirt/0_A2C_ItsDirt')
+   out_path = Path('study_out/e_1_Now_with_doors/no_obs/dirt/A2C_Now_with_doors/0_A2C_Now_with_doors')
    model_path = out_path

    with (out_path / f'env_params.json').open('r') as f:
        env_kwargs = yaml.load(f, Loader=yaml.FullLoader)
-       env_kwargs.update(additional_agent_placeholder=None, n_agents=n_agents)
+       env_kwargs.update(additional_agent_placeholder=None, n_agents=n_agents, max_steps=150)
    if gain_amount := env_kwargs.get('dirt_prop', {}).get('gain_amount', None):
        env_kwargs['dirt_prop']['max_spawn_amount'] = gain_amount
        del env_kwargs['dirt_prop']['gain_amount']
@@ -49,9 +50,9 @@ if __name__ == '__main__':
            if n_agents > 1:
                actions = [model.predict(
                    np.stack([env_state[i][j] for i in range(env_state.shape[0])]),
-                   deterministic=True)[0] for j, model in enumerate(models)]
+                   deterministic=determin)[0] for j, model in enumerate(models)]
            else:
-               actions = models[0].predict(env_state, deterministic=True)[0]
+               actions = models[0].predict(env_state, deterministic=determin)[0]
            if any([agent.pos in [door.pos for door in env.unwrapped[c.DOORS]]
                    for agent in env.unwrapped[c.AGENT]]):
                print('On Door')

View File

@@ -2,6 +2,7 @@ import sys
 from pathlib import Path
 from matplotlib import pyplot as plt
 import numpy as np
+import itertools as it

 try:
     # noinspection PyUnboundLocalVariable
@@ -70,7 +71,7 @@ baseline_monitor_file = 'e_1_baseline_monitor.pick'

 def policy_model_kwargs():
-    return dict(ent_coef=0.05)
+    return dict()


 def dqn_model_kwargs():
@@ -100,6 +101,7 @@ def load_model_run_baseline(seed_path, env_to_run):
    # Load old env kwargs
    with next(seed_path.glob('*.json')).open('r') as f:
        env_kwargs = simplejson.load(f)
+       env_kwargs.update(done_at_collision=True)
    # Monitor Init
    with MonitorCallback(filepath=seed_path / baseline_monitor_file) as monitor:
        # Init Env
@@ -134,6 +136,7 @@ def load_model_run_study(seed_path, env_to_run, additional_kwargs_dict):
        env_kwargs = simplejson.load(f)
        env_kwargs.update(
            n_agents=n_agents,
+           done_at_collision=True,
            **additional_kwargs_dict.get('post_training_kwargs', {}))
    # Monitor Init
    with MonitorCallback(filepath=seed_path / ood_monitor_file) as monitor:
@@ -168,6 +171,31 @@ def load_model_run_study(seed_path, env_to_run, additional_kwargs_dict):
        gc.collect()


+def start_mp_study_run(envs_map, policies_path):
+    paths = list(y for y in policies_path.iterdir() if y.is_dir() and not (y / ood_monitor_file).exists())
+    if paths:
+        import multiprocessing as mp
+        pool = mp.Pool(mp.cpu_count())
+        print("Starting MP with: ", pool._processes, " Processes")
+        _ = pool.starmap(load_model_run_study,
+                         it.product(paths,
+                                    (envs_map[policies_path.parent.name][0],),
+                                    (observation_modes[policies_path.parent.parent.name],))
+                         )
+
+
+def start_mp_baseline_run(envs_map, policies_path):
+    paths = list(y for y in policies_path.iterdir() if y.is_dir() and not (y / baseline_monitor_file).exists())
+    if paths:
+        import multiprocessing as mp
+        pool = mp.Pool(mp.cpu_count())
+        print("Starting MP with: ", pool._processes, " Processes")
+        _ = pool.starmap(load_model_run_baseline,
+                         it.product(paths,
+                                    (envs_map[policies_path.parent.name][0],))
+                         )
+
+
 if __name__ == '__main__':
    train_steps = 5e6
    n_seeds = 3
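Both new helpers lean on itertools.product to pair every seed directory with a single constant env (and, for the study run, a single observation mode), which yields exactly the argument tuples that Pool.starmap expects. A self-contained sketch of that pattern with toy values (names here are illustrative only):

    import itertools as it
    from multiprocessing import Pool

    def work(path, env_name):
        return f'{path}:{env_name}'

    if __name__ == '__main__':
        paths = ['seed_0', 'seed_1', 'seed_2']
        # a 1-element tuple in product() repeats the constant once per path
        args = list(it.product(paths, ('dirt',)))
        # -> [('seed_0', 'dirt'), ('seed_1', 'dirt'), ('seed_2', 'dirt')]
        with Pool(2) as pool:
            print(pool.starmap(work, args))   # ['seed_0:dirt', 'seed_1:dirt', 'seed_2:dirt']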
@@ -215,75 +243,74 @@ if __name__ == '__main__':
    # Define parameter versions according with #1,2[1,0,N],3
    observation_modes = {}
-   if False:
-       observation_modes.update({
-           'seperate_1': dict(
-               post_training_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.COMBINED,
-                       additional_agent_placeholder=None,
-                       omit_agent_self=True,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   ),
-               additional_env_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.NOT,
-                       additional_agent_placeholder=1,
-                       omit_agent_self=True,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   )
-           )})
-       observation_modes.update({
-           'seperate_0': dict(
-               post_training_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.COMBINED,
-                       additional_agent_placeholder=None,
-                       omit_agent_self=True,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   ),
-               additional_env_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.NOT,
-                       additional_agent_placeholder=0,
-                       omit_agent_self=True,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   )
-           )})
-       observation_modes.update({
-           'seperate_N': dict(
-               post_training_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.COMBINED,
-                       additional_agent_placeholder=None,
-                       omit_agent_self=True,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   ),
-               additional_env_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.NOT,
-                       additional_agent_placeholder='N',
-                       omit_agent_self=True,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   )
-           )})
-       observation_modes.update({
-           'in_lvl_obs': dict(
-               post_training_kwargs=
-                   dict(obs_prop=ObservationProperties(
-                       render_agents=AgentRenderOptions.LEVEL,
-                       omit_agent_self=True,
-                       additional_agent_placeholder=None,
-                       frames_to_stack=3,
-                       pomdp_r=2)
-                   )
-           )})
+   observation_modes.update({
+       'seperate_1': dict(
+           post_training_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.COMBINED,
+                   additional_agent_placeholder=None,
+                   omit_agent_self=True,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               ),
+           additional_env_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.NOT,
+                   additional_agent_placeholder=1,
+                   omit_agent_self=True,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               )
+       )})
+   observation_modes.update({
+       'seperate_0': dict(
+           post_training_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.COMBINED,
+                   additional_agent_placeholder=None,
+                   omit_agent_self=True,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               ),
+           additional_env_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.NOT,
+                   additional_agent_placeholder=0,
+                   omit_agent_self=True,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               )
+       )})
+   observation_modes.update({
+       'seperate_N': dict(
+           post_training_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.COMBINED,
+                   additional_agent_placeholder=None,
+                   omit_agent_self=True,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               ),
+           additional_env_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.NOT,
+                   additional_agent_placeholder='N',
+                   omit_agent_self=True,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               )
+       )})
+   observation_modes.update({
+       'in_lvl_obs': dict(
+           post_training_kwargs=
+               dict(obs_prop=ObservationProperties(
+                   render_agents=AgentRenderOptions.LEVEL,
+                   omit_agent_self=True,
+                   additional_agent_placeholder=None,
+                   frames_to_stack=3,
+                   pomdp_r=2)
+               )
+       )})
    observation_modes.update({
        # No further adjustment needed
        'no_obs': dict(
@@ -398,15 +425,7 @@ if __name__ == '__main__':
            for env_path in [x for x in obs_mode_path.iterdir() if x.is_dir()]:
                for policy_path in [x for x in env_path.iterdir() if x. is_dir()]:
                    # Iteration
-                   paths = list(y for y in policy_path.iterdir() if y.is_dir() \
-                                and not (y / baseline_monitor_file).exists())
-                   import multiprocessing as mp
-                   import itertools as it
-                   pool = mp.Pool(mp.cpu_count())
-                   result = pool.starmap(load_model_run_baseline,
-                                         it.product(paths,
-                                                    (env_map[env_path.name][0],))
-                                         )
+                   start_mp_baseline_run(env_map, policy_path)

                    # for seed_path in (y for y in policy_path.iterdir() if y.is_dir()):
                    #     load_model_run_baseline(seed_path)
@@ -424,18 +443,9 @@ if __name__ == '__main__':
                    # First seed path version
                    # seed_path = next((y for y in policy_path.iterdir() if y.is_dir()))
                    # Iteration
-                   import multiprocessing as mp
-                   import itertools as it
-                   pool = mp.Pool(mp.cpu_count())
-                   paths = list(y for y in policy_path.iterdir() if y.is_dir() \
-                                and not (y / ood_monitor_file).exists())
-                   # result = pool.starmap(load_model_run_study,
-                   #                       it.product(paths,
-                   #                                  (env_map[env_path.name][0],),
-                   #                                  (observation_modes[obs_mode],))
-                   #                       )
-                   for seed_path in (y for y in policy_path.iterdir() if y.is_dir()):
-                       load_model_run_study(seed_path, env_map[env_path.name][0], observation_modes[obs_mode])
+                   start_mp_study_run(env_map, policy_path)
+                   # for seed_path in (y for y in policy_path.iterdir() if y.is_dir()):
+                   #     load_model_run_study(seed_path, env_map[env_path.name][0], observation_modes[obs_mode])
            print('OOD Tracking Done')

            # Plotting
@@ -497,15 +507,16 @@ if __name__ == '__main__':
            # df_melted["Measurements"] = df_melted["Measurement"] + " " + df_melted["monitor"]

            # Plotting
-           fig, ax = plt.subplots(figsize=(11.7, 8.27))
+           # fig, ax = plt.subplots(figsize=(11.7, 8.27))
            c = sns.catplot(data=df_melted[df_melted['obs_mode'] == observation_folder.name],
                            x='Measurement', hue='monitor', row='model', col='env', y='Score',
-                           sharey=False, kind="box", height=4, aspect=.7, legend_out=True,
+                           sharey=False, kind="box", height=4, aspect=.7, legend_out=False, legend=False,
                            showfliers=False)
            c.set_xticklabels(rotation=65, horizontalalignment='right')
-           c.fig.subplots_adjust(top=0.9)  # adjust the Figure in rp
+           # c.fig.subplots_adjust(top=0.9)  # adjust the Figure in rp
            c.fig.suptitle(f"Cat plot for {observation_folder.name}")
-           plt.tight_layout(pad=2)
+           # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
+           plt.tight_layout()
            plt.savefig(study_root_path / f'results_{n_agents}_agents_{observation_folder.name}.png')
            pass
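The adjusted plotting call suppresses seaborn's own legend (legend=False, legend_out=False) and relies on a plain plt.tight_layout() instead of figure-level padding. A toy, self-contained sketch of the same call shape (column names mirror the script, the data is made up, and the row/col facets are omitted):

    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt

    # stand-in for df_melted: two measurements, two monitor sources, fake scores
    df = pd.DataFrame({'Measurement': ['dirt_amount', 'dirty_tile_count'] * 20,
                       'monitor':     ['in_domain'] * 20 + ['out_of_domain'] * 20,
                       'Score':       [i % 7 for i in range(40)]})

    c = sns.catplot(data=df, x='Measurement', hue='monitor', y='Score',
                    sharey=False, kind='box', height=4, aspect=.7,
                    legend_out=False, legend=False, showfliers=False)
    c.set_xticklabels(rotation=65, horizontalalignment='right')
    c.fig.suptitle('Cat plot (toy data)')
    plt.tight_layout()
    plt.savefig('results_sketch.png')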