Skip to content

Commit

Permalink
Merge branch 'master' into semimdp
Browse files Browse the repository at this point in the history
  • Loading branch information
markkho committed May 24, 2023
2 parents 0af9cae + 8e6d77b commit df63214
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 48 deletions.
63 changes: 40 additions & 23 deletions msdm/algorithms/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,16 @@ def __init__(self, *, seed=None, randomize_action_order=False):
self.seed = seed
self.randomize_action_order = randomize_action_order

def plan_on(self, dsp: MarkovDecisionProcess):
    """Run breadth-first search on a deterministic shortest-path problem.

    The input MDP is first converted via
    ``DeterministicShortestPathProblem.from_mdp``, so any MDP that is
    deterministic can be passed in directly.

    Returns a ``Result`` with the discovered ``path`` (start state through
    the absorbing state), a ``policy`` reconstructed from back-pointers,
    and the set of ``visited`` states. Returns ``None`` implicitly if the
    queue empties without reaching an absorbing state.
    """
    rnd = random.Random(self.seed)
    if self.randomize_action_order:
        shuffled = make_shuffled(rnd)
    else:
        # Identity: keep the MDP's action order as-is.
        shuffled = lambda actions: actions

    dsp = DeterministicShortestPathProblem.from_mdp(dsp)

    start = dsp.initial_state()

    # FIFO frontier gives breadth-first (shortest-in-edges) order.
    queue = collections.deque([start])

    visited = set([])
    # camefrom[ns] = (s, a): state/action pair that first reached ns.
    camefrom = dict()

    while queue:
        s = queue.popleft()

        if dsp.is_absorbing(s):
            path = reconstruct_path(camefrom, start, s)
            return Result(
                path=path,
                policy=camefrom_to_policy(path, camefrom, dsp),
                visited=visited,
            )

        visited.add(s)

        for a in shuffled(dsp.actions(s)):
            ns = dsp.next_state(s, a)
            # NOTE: `ns not in queue` is an O(n) scan on the deque;
            # acceptable for the small state spaces this is used with.
            if ns not in visited and ns not in queue:
                queue.append(ns)
                camefrom[ns] = (s, a)
def __init__(
        self,
        *,
        heuristic_value=lambda s: 0,
        seed=None,
        randomize_action_order=False,
        tie_breaking_strategy='lifo'
):
    """Configure A* search.

    Parameters
    ----------
    heuristic_value : callable
        Maps a state to an (admissible) heuristic value; defaults to the
        zero heuristic, which reduces A* to uniform-cost search.
    seed : int or None
        Seed for the RNG used for action shuffling and random tie-breaking.
    randomize_action_order : bool
        If True, actions are expanded in a shuffled order.
    tie_breaking_strategy : str
        One of 'random', 'lifo', or 'fifo' — how entries with equal
        priority are ordered in the heap.
    """
    self.heuristic_value = heuristic_value
    self.seed = seed
    self.randomize_action_order = randomize_action_order
    assert tie_breaking_strategy in ['random', 'lifo', 'fifo']
    self.tie_breaking_strategy = tie_breaking_strategy

def plan_on(self, dsp: MarkovDecisionProcess):
    """Run A* search on a deterministic shortest-path problem.

    The input MDP is converted via
    ``DeterministicShortestPathProblem.from_mdp``. Rewards are negated to
    form costs, and ``self.heuristic_value`` is subtracted to form the
    priority (cost-so-far plus estimated cost-to-go).

    Returns a ``Result`` with ``path``, ``policy``, and ``visited``.
    """
    rnd = random.Random(self.seed)
    if self.randomize_action_order:
        shuffled = make_shuffled(rnd)
    else:
        # Identity: keep the MDP's action order as-is.
        shuffled = lambda actions: actions

    dsp = DeterministicShortestPathProblem.from_mdp(dsp)

    # Every queue entry is a pair of
    # - a tuple of priorities/costs (the cost-to-go, a tie-breaker, and cost-so-far)
    # - the state
    queue = []
    start = dsp.initial_state()
    if self.tie_breaking_strategy in ['lifo', 'fifo']:
        tie_break = 0
        if self.tie_breaking_strategy == 'lifo':
            # The heap is a min-heap, so to ensure last-in first-out
            # the tie-breaker must decrease. Since it's always
            # decreasing, later elements of equivalent value have greater priority.
            tie_break_delta = -1
        else:
            # See above comment. First-in first-out requires that our tie-breaker increases.
            tie_break_delta = +1
    else:
        tie_break = rnd.random()
    heapq.heappush(queue, ((-self.heuristic_value(start), tie_break, 0), start))

    visited = set([])
    # camefrom[ns] = (s, a): state/action pair that first reached ns.
    camefrom = dict()

    while queue:
        (heuristic_cost, _, cost_from_start), s = heapq.heappop(queue)

        if dsp.is_absorbing(s):
            path = reconstruct_path(camefrom, start, s)
            return Result(
                path=path,
                policy=camefrom_to_policy(path, camefrom, dsp),
                visited=visited,
            )

        visited.add(s)

        for a in shuffled(dsp.actions(s)):
            ns = dsp.next_state(s, a)
            # NOTE: membership scan over the heap is O(n) per neighbor;
            # acceptable for the small state spaces this is used with.
            if ns not in visited and ns not in [el[-1] for el in queue]:
                # Costs are negated rewards.
                next_cost_from_start = cost_from_start - dsp.reward(s, a, ns)
                next_heuristic_cost = next_cost_from_start - self.heuristic_value(ns)
                if self.tie_breaking_strategy in ['lifo', 'fifo']:
                    tie_break += tie_break_delta
                else:
                    tie_break = rnd.random()
                heapq.heappush(queue, ((next_heuristic_cost, tie_break, next_cost_from_start), ns))
                camefrom[ns] = (s, a)
120 changes: 95 additions & 25 deletions msdm/tests/test_search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from msdm.algorithms import BreadthFirstSearch, AStarSearch
from msdm.core.mdp import MarkovDecisionProcess
from msdm.domains import GridWorld
from msdm.domains.gridworld.mdp import GridWorld, TERMINALSTATE
from msdm.tests.domains import DeterministicCounter

gw = GridWorld(
Expand All @@ -16,6 +16,19 @@
discount_rate=1.0
)

# Obstacle-free 5x5 grid: start 's' at the bottom-left corner, goal 'g' at
# the top-right, unit step cost and no discounting — every shortest path
# has the same total cost, which makes tie-breaking behavior observable.
empty_gw = GridWorld(
    discount_rate=1.0,
    step_cost=-1,
    feature_rewards={'g': 0},
    tile_array=[
        '....g',
        '.....',
        '.....',
        '.....',
        's....',
    ],
)

def test_bfs():
    """BFS on the small grid world finds the known shortest path."""
    result = BreadthFirstSearch().plan_on(gw)
    expected = [(0, 2), (1, 2), (2, 2), (-1, -1)]
    actual = [(state['x'], state['y']) for state in result.path]
    assert actual == expected
def test_deterministic_shortest_path():
    """BFS on the 1-D counter MDP steps 0 -> 3 one increment at a time."""
    res = BreadthFirstSearch().plan_on(DeterministicCounter(3))
    assert res.path == [0, 1, 2, 3]

def make_manhattan_distance_heuristic(mdp : MarkovDecisionProcess):
    """Build a heuristic for *mdp*: negated Manhattan distance to its goal.

    Absorbing states get a heuristic value of 0; all other states get the
    negative Manhattan distance to the first absorbing state. Values are
    negative because msdm heuristics estimate (maximized) reward-to-go.
    """
    def manhattan_distance_heuristic(s):
        if mdp.is_absorbing(s):
            return 0
        goal = mdp.absorbing_states[0]
        dx = abs(s['x'] - goal['x'])
        dy = abs(s['y'] - goal['y'])
        return -(dx + dy)
    return manhattan_distance_heuristic

def test_astarsearch():
    """A* with LIFO tie-breaking is deterministic over seeds; randomized
    action order can recover either optimal path."""
    # The two equal-cost optimal paths around the obstacle.
    path_above = ((0, 2), (0, 3), (0, 4), (1, 4), (2, 4), (2, 3), (2, 2), (-1, -1))
    path_below = ((0, 2), (0, 1), (0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (-1, -1))

    found = set()
    for seed in range(10):
        planner = AStarSearch(
            heuristic_value=make_manhattan_distance_heuristic(gw),
            randomize_action_order=False,
            seed=seed + 42,
        )
        res = planner.plan_on(gw)
        found.add(tuple([(s['x'], s['y']) for s in res.path]))
    # We deterministically select one of the two paths, because we tie-break with LIFO.
    # Depends on random seed.
    assert found == {path_below}

    found = set()
    for seed in range(10):
        planner = AStarSearch(
            heuristic_value=make_manhattan_distance_heuristic(gw),
            randomize_action_order=True,
            seed=seed + 42
        )
        res = planner.plan_on(gw)
        found.add(tuple([(s['x'], s['y']) for s in res.path]))
    # When action order is randomized, each of the optimal solutions are possible.
    # Depends on random seed, but likely.
    assert found == {path_above, path_below}

def test_astarsearch_tie_breaking():
    """Compare how each tie-breaking strategy affects states visited on an
    empty grid, where every state has identical cost + heuristic."""

    OPTIMAL_PATH_LEN = 9
    TOTAL_STATES = 25
    assert len(set(empty_gw.state_list) - {TERMINALSTATE}) == TOTAL_STATES

    def run_planner(**kwargs):
        seed = kwargs.pop('seed', 42)
        return AStarSearch(
            heuristic_value=make_manhattan_distance_heuristic(empty_gw),
            randomize_action_order=False,
            seed=seed,
            **kwargs,
        ).plan_on(empty_gw)

    def assert_optimal_path(result):
        assert result.path[-1] == TERMINALSTATE
        assert len(result.path) - 1 == OPTIMAL_PATH_LEN

    # LIFO tie-breaking searches fewer states with equivalent cost + heuristic,
    # because it prioritizes more recently added entries to the queue. On this
    # empty grid the heuristic has no error, so cost + heuristic is equal at
    # every state; LIFO keeps the search focused on recently visited states,
    # so only the states on the final path get visited.
    result = run_planner(tie_breaking_strategy='lifo')
    assert len(result.visited) == OPTIMAL_PATH_LEN
    assert result.visited == set(result.path[:-1])
    assert_optimal_path(result)

    # FIFO is more thorough, prioritizing states added earlier; with uniform
    # cost + heuristic it sweeps the whole state space.
    result = run_planner(tie_breaking_strategy='fifo')
    assert len(result.visited) == TOTAL_STATES
    assert result.visited == set(empty_gw.state_list) - {TERMINALSTATE}
    assert_optimal_path(result)

    # Random tie-breaking lands between the two extremes.
    for seed in range(100):
        result = run_planner(tie_breaking_strategy='random', seed=seed + 47283674)
        # This is guaranteed.
        assert OPTIMAL_PATH_LEN <= len(result.visited) <= TOTAL_STATES
        # This stricter bound will depend on the random seed, but is likely.
        assert OPTIMAL_PATH_LEN < len(result.visited) < TOTAL_STATES
        assert_optimal_path(result)

0 comments on commit df63214

Please sign in to comment.