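Fix strange behaviours with best_reward: track best_reward per node, set mu to the mean rollout reward, and drop the null-state baseline subtraction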

37cb43af · brian.lee · bbde9a45 · 37cb43af
Commit 37cb43af authored 5 years ago by brian.lee
--- a/DecMCTS.py
+++ b/DecMCTS.py
+from __future__ import print_function
+
 import networkx as nx
 from copy import copy
 from math import log
 import numpy as np

-
 def _UCT(mu_j, c_p, n_p, n_j):
    if n_j ==0:
        return float("Inf")
@@ -111,6 +112,7 @@ class Tree:
        self.graph.add_node(1, 
                mu=0, 
                N=0, 
+                best_reward = 0,
                state=self.state_store(self.data, None, None, self.id)
                )

@@ -212,6 +214,7 @@ class Tree:

            self.graph.add_node(len(self.graph)+1,
                    mu = 0,
+                    best_reward = 0,
                    N = 0,
                    state=self.state_store(self.data, self.graph.node[start_node]["state"], o, self.id)
                    )
@@ -240,7 +243,6 @@ class Tree:
        ### EXPANSION
        # check if _expansion changes start_node to the node after jumping
        self._expansion(start_node)
-        print(self._childNodes(start_node))

        ### SIMULATION
        avg_reward = 0
@@ -274,15 +276,17 @@ class Tree:
                state[self.id] = temp_state

            # calculate the reward at the end of simulation
-            rew = self.reward(self.data, state) \
-                - self.reward(self.data, self._null_state(state))
-
+            rew = self.reward(self.data, state) 
+            avg_reward += rew
            # if best reward so far, store the rollout in the new node
            if rew > best_reward:
                best_reward = rew
                best_rollout = copy(temp_state)
-                self.graph.node[start_node]["mu"] = avg_reward

+        avg_reward = avg_reward / nsims
+
+        self.graph.node[start_node]["mu"] = avg_reward
+        self.graph.node[start_node]["best_reward"] = best_reward
        self.graph.node[start_node]["N"] = 1
        self.graph.node[start_node]["best_rollout"] = copy(best_rollout)

@@ -299,6 +303,10 @@ class Tree:
            self.graph.node[start_node]["N"] = \
                    gamma * self.graph.node[start_node]["N"] + 1

+            if best_reward > self.graph.node[start_node]["best_reward"]:
+                self.graph.node[start_node]["best_reward"] = best_reward
+                self.graph.node[start_node]["best_rollout"] = copy(best_rollout)
+
        self._update_distribution()

        return avg_reward