Skip to content

Commit

Permalink
Merge branch 'master' into semimdp
Browse files Browse the repository at this point in the history
  • Loading branch information
markkho committed May 24, 2023
2 parents 0af9cae + 8e6d77b commit df63214
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 48 deletions.
63 changes: 40 additions & 23 deletions msdm/algorithms/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,16 @@ def __init__(self, *, seed=None, randomize_action_order=False):
self.seed = seed
self.randomize_action_order = randomize_action_order

def plan_on(self, dsp: MarkovDecisionProcess):
    """Run breadth-first search on a deterministic shortest-path problem.

    The input MDP is first converted via
    ``DeterministicShortestPathProblem.from_mdp``, so any MDP that is
    deterministic can be passed in directly.

    Returns a ``Result`` with the discovered ``path`` (start state through
    the absorbing state), a ``policy`` reconstructed from back-pointers,
    and the set of ``visited`` states. Returns ``None`` implicitly if the
    queue empties without reaching an absorbing state.
    """
    rnd = random.Random(self.seed)
    if self.randomize_action_order:
        shuffled = make_shuffled(rnd)
    else:
        # Identity: keep the MDP's action order as-is.
        shuffled = lambda actions: actions

    dsp = DeterministicShortestPathProblem.from_mdp(dsp)

    start = dsp.initial_state()

    # FIFO frontier gives breadth-first (shortest-in-edges) order.
    queue = collections.deque([start])

    visited = set([])
    # camefrom[ns] = (s, a): state/action pair that first reached ns.
    camefrom = dict()

    while queue:
        s = queue.popleft()

        if dsp.is_absorbing(s):
            path = reconstruct_path(camefrom, start, s)
            return Result(
                path=path,
                policy=camefrom_to_policy(path, camefrom, dsp),
                visited=visited,
            )

        visited.add(s)

        for a in shuffled(dsp.actions(s)):
            ns = dsp.next_state(s, a)
            # NOTE: `ns not in queue` is an O(n) scan on the deque;
            # acceptable for the small state spaces this is used with.
            if ns not in visited and ns not in queue:
                queue.append(ns)
                camefrom[ns] = (s, a)
def __init__(
        self,
        *,
        heuristic_value=lambda s: 0,
        seed=None,
        randomize_action_order=False,
        tie_breaking_strategy='lifo'
):
    """Configure A* search.

    Parameters
    ----------
    heuristic_value : callable
        Maps a state to an (admissible) heuristic value; defaults to the
        zero heuristic, which reduces A* to uniform-cost search.
    seed : int or None
        Seed for the RNG used for action shuffling and random tie-breaking.
    randomize_action_order : bool
        If True, actions are expanded in a shuffled order.
    tie_breaking_strategy : str
        One of 'random', 'lifo', or 'fifo' — how entries with equal
        priority are ordered in the heap.
    """
    self.heuristic_value = heuristic_value
    self.seed = seed
    self.randomize_action_order = randomize_action_order
    assert tie_breaking_strategy in ['random', 'lifo', 'fifo']
    self.tie_breaking_strategy = tie_breaking_strategy

def plan_on(self, dsp: MarkovDecisionProcess):
    """Run A* search on a deterministic shortest-path problem.

    The input MDP is converted via
    ``DeterministicShortestPathProblem.from_mdp``. Rewards are negated to
    form costs, and ``self.heuristic_value`` is subtracted to form the
    priority (cost-so-far plus estimated cost-to-go).

    Returns a ``Result`` with ``path``, ``policy``, and ``visited``.
    """
    rnd = random.Random(self.seed)
    if self.randomize_action_order:
        shuffled = make_shuffled(rnd)
    else:
        # Identity: keep the MDP's action order as-is.
        shuffled = lambda actions: actions

    dsp = DeterministicShortestPathProblem.from_mdp(dsp)

    # Every queue entry is a pair of
    # - a tuple of priorities/costs (the cost-to-go, a tie-breaker, and cost-so-far)
    # - the state
    queue = []
    start = dsp.initial_state()
    if self.tie_breaking_strategy in ['lifo', 'fifo']:
        tie_break = 0
        if self.tie_breaking_strategy == 'lifo':
            # The heap is a min-heap, so to ensure last-in first-out
            # the tie-breaker must decrease. Since it's always
            # decreasing, later elements of equivalent value have greater priority.
            tie_break_delta = -1
        else:
            # See above comment. First-in first-out requires that our tie-breaker increases.
            tie_break_delta = +1
    else:
        tie_break = rnd.random()
    heapq.heappush(queue, ((-self.heuristic_value(start), tie_break, 0), start))

    visited = set([])
    # camefrom[ns] = (s, a): state/action pair that first reached ns.
    camefrom = dict()

    while queue:
        (heuristic_cost, _, cost_from_start), s = heapq.heappop(queue)

        if dsp.is_absorbing(s):
            path = reconstruct_path(camefrom, start, s)
            return Result(
                path=path,
                policy=camefrom_to_policy(path, camefrom, dsp),
                visited=visited,
            )

        visited.add(s)

        for a in shuffled(dsp.actions(s)):
            ns = dsp.next_state(s, a)
            # NOTE: membership scan over the heap is O(n) per neighbor;
            # acceptable for the small state spaces this is used with.
            if ns not in visited and ns not in [el[-1] for el in queue]:
                # Costs are negated rewards.
                next_cost_from_start = cost_from_start - dsp.reward(s, a, ns)
                next_heuristic_cost = next_cost_from_start - self.heuristic_value(ns)
                if self.tie_breaking_strategy in ['lifo', 'fifo']:
                    tie_break += tie_break_delta
                else:
                    tie_break = rnd.random()
                heapq.heappush(queue, ((next_heuristic_cost, tie_break, next_cost_from_start), ns))
                camefrom[ns] = (s, a)
120 changes: 95 additions & 25 deletions msdm/tests/test_search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from msdm.algorithms import BreadthFirstSearch, AStarSearch
from msdm.core.mdp import MarkovDecisionProcess
from msdm.domains import GridWorld
from msdm.domains.gridworld.mdp import GridWorld, TERMINALSTATE
from msdm.tests.domains import DeterministicCounter

gw = GridWorld(
Expand All @@ -16,6 +16,19 @@
discount_rate=1.0
)

# Obstacle-free 5x5 grid: start 's' at the bottom-left corner, goal 'g' at
# the top-right, unit step cost and no discounting — every shortest path
# has the same total cost, which makes tie-breaking behavior observable.
empty_gw = GridWorld(
    discount_rate=1.0,
    step_cost=-1,
    feature_rewards={'g': 0},
    tile_array=[
        '....g',
        '.....',
        '.....',
        '.....',
        's....',
    ],
)

def test_bfs():
    """BFS on the small grid world finds the known shortest path."""
    result = BreadthFirstSearch().plan_on(gw)
    expected = [(0, 2), (1, 2), (2, 2), (-1, -1)]
    actual = [(state['x'], state['y']) for state in result.path]
    assert actual == expected
def test_deterministic_shortest_path():
    """BFS on the 1-D counter MDP steps 0 -> 3 one increment at a time."""
    res = BreadthFirstSearch().plan_on(DeterministicCounter(3))
    assert res.path == [0, 1, 2, 3]

def make_manhattan_distance_heuristic(mdp : MarkovDecisionProcess):
    """Build a heuristic for *mdp*: negated Manhattan distance to its goal.

    Absorbing states get a heuristic value of 0; all other states get the
    negative Manhattan distance to the first absorbing state. Values are
    negative because msdm heuristics estimate (maximized) reward-to-go.
    """
    def manhattan_distance_heuristic(s):
        if mdp.is_absorbing(s):
            return 0
        goal = mdp.absorbing_states[0]
        dx = abs(s['x'] - goal['x'])
        dy = abs(s['y'] - goal['y'])
        return -(dx + dy)
    return manhattan_distance_heuristic

def test_astarsearch():
    """A* with LIFO tie-breaking is deterministic over seeds; randomized
    action order can recover either optimal path."""
    # The two equal-cost optimal paths around the obstacle.
    path_above = ((0, 2), (0, 3), (0, 4), (1, 4), (2, 4), (2, 3), (2, 2), (-1, -1))
    path_below = ((0, 2), (0, 1), (0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (-1, -1))

    found = set()
    for seed in range(10):
        planner = AStarSearch(
            heuristic_value=make_manhattan_distance_heuristic(gw),
            randomize_action_order=False,
            seed=seed + 42,
        )
        res = planner.plan_on(gw)
        found.add(tuple([(s['x'], s['y']) for s in res.path]))
    # We deterministically select one of the two paths, because we tie-break with LIFO.
    # Depends on random seed.
    assert found == {path_below}

    found = set()
    for seed in range(10):
        planner = AStarSearch(
            heuristic_value=make_manhattan_distance_heuristic(gw),
            randomize_action_order=True,
            seed=seed + 42
        )
        res = planner.plan_on(gw)
        found.add(tuple([(s['x'], s['y']) for s in res.path]))
    # When action order is randomized, each of the optimal solutions are possible.
    # Depends on random seed, but likely.
    assert found == {path_above, path_below}

def test_astarsearch_tie_breaking():
    """Compare how each tie-breaking strategy affects states visited on an
    empty grid, where every state has identical cost + heuristic."""

    OPTIMAL_PATH_LEN = 9
    TOTAL_STATES = 25
    assert len(set(empty_gw.state_list) - {TERMINALSTATE}) == TOTAL_STATES

    def run_planner(**kwargs):
        seed = kwargs.pop('seed', 42)
        return AStarSearch(
            heuristic_value=make_manhattan_distance_heuristic(empty_gw),
            randomize_action_order=False,
            seed=seed,
            **kwargs,
        ).plan_on(empty_gw)

    def assert_optimal_path(result):
        assert result.path[-1] == TERMINALSTATE
        assert len(result.path) - 1 == OPTIMAL_PATH_LEN

    # LIFO tie-breaking searches fewer states with equivalent cost + heuristic,
    # because it prioritizes more recently added entries to the queue. On this
    # empty grid the heuristic has no error, so cost + heuristic is equal at
    # every state; LIFO keeps the search focused on recently visited states,
    # so only the states on the final path get visited.
    result = run_planner(tie_breaking_strategy='lifo')
    assert len(result.visited) == OPTIMAL_PATH_LEN
    assert result.visited == set(result.path[:-1])
    assert_optimal_path(result)

    # FIFO is more thorough, prioritizing states added earlier; with uniform
    # cost + heuristic it sweeps the whole state space.
    result = run_planner(tie_breaking_strategy='fifo')
    assert len(result.visited) == TOTAL_STATES
    assert result.visited == set(empty_gw.state_list) - {TERMINALSTATE}
    assert_optimal_path(result)

    # Random tie-breaking lands between the two extremes.
    for seed in range(100):
        result = run_planner(tie_breaking_strategy='random', seed=seed + 47283674)
        # This is guaranteed.
        assert OPTIMAL_PATH_LEN <= len(result.visited) <= TOTAL_STATES
        # This stricter bound will depend on the random seed, but is likely.
        assert OPTIMAL_PATH_LEN < len(result.visited) < TOTAL_STATES
        assert_optimal_path(result)

0 comments on commit df63214

Please sign in to comment.