Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Policy iteration (excerpt of a solver method; its header, and the
# definitions of v_init and max_iter, are outside this view).
# NOTE(review): indentation was lost in this excerpt, so which statements
# belong to the loop body is not visible here -- confirm against the
# original file before editing.
# Initial policy: greedy with respect to the starting value v_init.
sigma = self.compute_greedy(v_init)
# Scratch integer array (uninitialized) that receives each improved policy.
new_sigma = np.empty(self.num_states, dtype=int)
for i in range(max_iter):
# Policy evaluation: value obtained from the current policy sigma
v_sigma = self.evaluate_policy(sigma)
# Policy improvement: the return value is discarded and new_sigma is read
# on the next line, so the result is presumably written into new_sigma in
# place -- TODO confirm against compute_greedy.
self.compute_greedy(v_sigma, sigma=new_sigma)
# Stop as soon as the improvement step leaves the policy unchanged.
if np.array_equal(new_sigma, sigma):
break
# Copy element-wise so sigma and new_sigma remain separate buffers.
sigma[:] = new_sigma
# i is zero-based; i + 1 is the count of passes performed.
num_iter = i + 1
# Bundle the value, policy, iteration count and the chain produced by
# self.controlled_mc(sigma) into the result object.
res = DPSolveResult(v=v_sigma,
sigma=sigma,
num_iter=num_iter,
mc=self.controlled_mc(sigma),
method='policy iteration',
max_iter=max_iter)
return res
# Modified policy iteration (excerpt of a solver method; v, u, sigma, k,
# epsilon, max_iter, span and midrange are all defined outside this view).
# NOTE(review): indentation was lost in this excerpt, so loop/branch nesting
# is not visible here -- confirm against the original file before editing.
# NOTE(review): as excerpted, tol is set to infinity unconditionally, which
# would make the `< tol` test below pass on the first iteration for any
# finite span. Presumably this line is the tail of a conditional whose
# other branch lies above the excerpt -- confirm.
tol = np.inf
for i in range(max_iter):
# Policy improvement: the return value is discarded while u and sigma are
# read afterwards, so both are presumably filled in place -- TODO confirm.
self.bellman_operator(v, Tv=u, sigma=sigma)
# Change in the value between v and its update u.
diff = u - v
# Termination test on span(diff); on success v is set to u plus a constant
# offset midrange(diff) * beta / (1 - beta) and the loop ends.
if span(diff) < tol:
v[:] = u + midrange(diff) * self.beta / (1 - self.beta)
break
# Partial policy evaluation with k iterations of the operator
# self.T_sigma(sigma), starting from u (presumably updating u in place,
# since the return value is discarded -- TODO confirm).
self.operator_iteration(T=self.T_sigma(sigma), v=u, max_iter=k)
# Carry the partially evaluated value into the next pass.
v[:] = u
# i is zero-based; i + 1 is the count of passes performed.
num_iter = i + 1
# Bundle the value, policy, iteration count, the chain produced by
# self.controlled_mc(sigma) and the solver settings into the result object.
res = DPSolveResult(v=v,
sigma=sigma,
num_iter=num_iter,
mc=self.controlled_mc(sigma),
method='modified policy iteration',
epsilon=epsilon,
max_iter=max_iter,
k=k)
return res
# Value iteration (excerpt of a solver method; v_init, tol, epsilon and
# max_iter are defined outside this view).
# NOTE(review): indentation was lost in this excerpt, so branch nesting is
# not visible here -- confirm against the original file before editing.
# Working value array (uninitialized until filled by one branch below).
v = np.empty(self.num_states)
if v_init is None:
# No starting value supplied: fill v from self.R via s_wise_max, which is
# handed v as its `out` argument.
self.s_wise_max(self.R, out=v)
else:
# Copy the caller's starting value into the working array.
v[:] = v_init
# Storage array for self.bellman_operator
Tv = np.empty(self.num_states)
# Iterate the Bellman operator on v; the return value is used as the
# iteration count. v is read afterwards, so it is presumably updated in
# place -- TODO confirm against operator_iteration.
num_iter = self.operator_iteration(T=self.bellman_operator,
v=v, max_iter=max_iter, tol=tol,
Tv=Tv)
# Policy that is greedy with respect to the final value.
sigma = self.compute_greedy(v)
# Bundle the value, policy, iteration count, the chain produced by
# self.controlled_mc(sigma) and the solver settings into the result object.
res = DPSolveResult(v=v,
sigma=sigma,
num_iter=num_iter,
mc=self.controlled_mc(sigma),
method='value iteration',
epsilon=epsilon,
max_iter=max_iter)
return res