{"id":70380,"date":"2026-02-01T22:05:59","date_gmt":"2026-02-01T14:05:59","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/70380.html"},"modified":"2026-02-01T22:05:59","modified_gmt":"2026-02-01T14:05:59","slug":"%e8%ae%a9-q-%e5%80%bc%e4%bc%b0%e8%ae%a1%e6%9b%b4%e5%87%86%e7%a1%ae%ef%bc%9a%e4%bb%8e-dqn-%e5%88%b0-double-dqn-%e7%9a%84%e6%94%b9%e8%bf%9b%e6%96%b9%e6%a1%88","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/70380.html","title":{"rendered":"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848"},"content":{"rendered":"<p>DQN \u7528<\/p>\n<p>max Q(s&#039;,a&#039;)<\/p>\n<p>\u8ba1\u7b97\u76ee\u6807\u503c&#xff0c;\u7b49\u4e8e\u5728\u6311 Q \u503c\u6700\u9ad8\u7684\u52a8\u4f5c&#xff0c;\u4f46\u662f\u8fd9\u4e9b\u52a8\u4f5c\u4e2d\u5305\u62ec\u4e86\u90a3\u4e9b\u56e0\u4e3a\u4f30\u8ba1\u566a\u58f0\u800c\u88ab\u9ad8\u4f30\u7684\u52a8\u4f5c&#xff0c;\u6240\u4ee5\u5c31\u4f1a\u4ea7\u751f\u8fc7\u4f30\u8ba1\u504f\u5dee&#xff0c;\u76f4\u63a5\u540e\u679c\u662f\u8bad\u7ec3\u4e0d\u7a33\u5b9a\u3001\u7b56\u7565\u6b21\u4f18\u3002<\/p>\n<p>\u8fd9\u7bc7\u6587\u7ae0\u8981\u89e3\u51b3\u7684\u5c31\u662f\u8fd9\u4e2a\u95ee\u9898&#xff0c;\u5185\u5bb9\u5305\u62ec&#xff1a;DQN \u4e3a\u4ec0\u4e48\u4f1a\u8fc7\u4f30\u8ba1\u3001Double DQN \u600e\u4e48\u628a\u52a8\u4f5c\u9009\u62e9\u548c\u8bc4\u4f30\u62c6\u5f00\u3001Dueling DQN \u600e\u4e48\u5206\u79bb\u72b6\u6001\u503c\u548c\u52a8\u4f5c\u4f18\u52bf\u3001\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u5982\u4f55\u8ba9\u91c7\u6837\u66f4\u806a\u660e&#xff0c;\u4ee5\u53ca\u7528 PyTorch \u4ece\u5934\u5b9e\u73b0\u8fd9\u4e9b\u6539\u8fdb\u3002\u6700\u540e\u8fd8\u4f1a\u4ecb\u7ecd\u4e00\u4e2a CleanRL \u7684\u4e13\u4e1a\u5b9e\u73b0\u3002 <img decoding=\"async\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2026\/02\/20260201140558-697f5dc698df7.png\" alt=\"\" \/><\/p>\n<h3>\u8fc7\u4f30\u8ba1\u95ee\u9898<\/h3>\n<p>DQN \u7684\u76ee\u6807\u503c\u5982\u4e0b&#xff1a;<\/p>\n<p> y &#061; 
r &#043; \u03b3\u00b7max\u2090&#039; Q(s&#039;, a&#039;; \u03b8\u207b)<\/p>\n<p>\u95ee\u9898\u5c31\u5728\u4e8e&#xff0c;\u540c\u4e00\u4e2a\u7f51\u7edc\u65e2\u8d1f\u8d23\u9009\u52a8\u4f5c&#xff08;a* &#061; argmax Q&#xff09;&#xff0c;\u53c8\u8d1f\u8d23\u8bc4\u4f30\u8fd9\u4e2a\u52a8\u4f5c\u7684\u4ef7\u503c\u3002Q \u503c\u672c\u8eab\u662f\u5e26\u566a\u58f0\u7684\u4f30\u8ba1\u6240\u4ee5\u6709\u65f6\u5019\u566a\u58f0\u4f1a\u8ba9\u5dee\u52a8\u4f5c\u7684 Q \u503c\u504f\u9ad8&#xff0c;\u53d6 max \u64cd\u4f5c\u5929\u7136\u504f\u5411\u9009\u90a3\u4e9b\u88ab\u9ad8\u4f30\u7684\u52a8\u4f5c\u3002<\/p>\n<p>\u6570\u5b66\u4e0a\u6709\u4e2a\u76f4\u89c2\u7684\u89e3\u91ca&#xff1a;<\/p>\n<p> E[max(X\u2081, X\u2082, &#8230;, X\u2099)] \u2265 max(E[X\u2081], E[X\u2082], &#8230;, E[X\u2099])<\/p>\n<p>\u6700\u5927\u503c\u7684\u671f\u671b\u603b\u662f\u5927\u4e8e\u7b49\u4e8e\u671f\u671b\u7684\u6700\u5927\u503c&#xff0c;\u8fd9\u662f\u51f8\u51fd\u6570\u7684 Jensen \u4e0d\u7b49\u5f0f\u3002<\/p>\n<h4><\/h4>\n<p>\u8fc7\u4f30\u8ba1\u4f1a\u5bfc\u81f4\u6536\u655b\u53d8\u6162&#xff0c;\u667a\u80fd\u4f53\u628a\u65f6\u95f4\u6d6a\u8d39\u5728\u63a2\u7d22\u90a3\u4e9b\u88ab\u9ad8\u4f30\u7684\u52a8\u4f5c\u4e0a\u3002\u5176\u6b21\u662f\u7b56\u7565\u8d28\u91cf\u6253\u6298\u6263&#xff0c;\u9ad8\u566a\u58f0\u7684\u52a8\u4f5c\u53ef\u80fd\u6bd4\u771f\u6b63\u597d\u7684\u52a8\u4f5c\u66f4\u53d7\u9752\u7750\u3002\u66f4\u7cdf\u7684\u662f\u8fc7\u4f30\u8ba1\u4f1a\u4e0d\u65ad\u7d2f\u79ef&#xff0c;\u5bfc\u81f4\u8bad\u7ec3\u53d1\u6563\u3002\u6cdb\u5316\u80fd\u529b\u4e5f\u4f1a\u53d7\u635f\u2014\u2014\u5728\u72b6\u6001\u7a7a\u95f4\u7684\u566a\u58f0\u533a\u57df&#xff0c;\u667a\u80fd\u4f53\u4f1a\u8868\u73b0\u5f97\u8fc7\u4e8e\u81ea\u4fe1\u3002<\/p>\n<h3>Double DQN&#xff1a;\u628a\u9009\u62e9\u548c\u8bc4\u4f30\u62c6\u5f00<\/h3>\n<p>\u6807\u51c6 DQN \u4e00\u4e2a\u7f51\u7edc\u5e72\u4e24\u4ef6\u4e8b&#xff1a;<\/p>\n<p> a* &#061; argmax\u2090&#039; Q(s&#039;, a&#039;; \u03b8\u207b)  # \u9009\u6700\u4f73\u52a8\u4f5c<br \/>\n y &#061; r &#043; \u03b3 \u00b7 
Q(s&#039;, a*; \u03b8\u207b)    # \u8bc4\u4f30\u8fd9\u4e2a\u52a8\u4f5c&#xff08;\u540c\u4e00\u4e2a\u7f51\u7edc&#xff09;<\/p>\n<p>Double DQN \u7528\u4e24\u4e2a\u7f51\u7edc&#xff0c;\u5404\u7ba1\u4e00\u4ef6&#xff1a;<\/p>\n<p> a* &#061; argmax\u2090&#039; Q(s&#039;, a&#039;; \u03b8)  # \u7528\u5f53\u524d\u7f51\u7edc\u9009<br \/>\n y &#061; r &#043; \u03b3 \u00b7 Q(s&#039;, a*; \u03b8\u207b)   # \u7528\u76ee\u6807\u7f51\u7edc\u8bc4\u4f30<\/p>\n<p>\u5f53\u524d\u7f51\u7edc&#xff08;\u03b8&#xff09;\u9009\u52a8\u4f5c&#xff0c;\u76ee\u6807\u7f51\u7edc&#xff08;\u03b8\u207b&#xff09;\u8bc4\u4f30\u3002\u4e24\u4e2a\u7f51\u7edc\u7684\u8bef\u5dee\u4e0d\u76f8\u5173\u8fd9\u6837\u6700\u5927\u5316\u504f\u5dee\u5c31\u88ab\u6253\u7834\u4e86\u3002<\/p>\n<p>\u4e3a\u4ec0\u4e48\u6709\u6548\u5462&#xff1f;<\/p>\n<p>\u5047\u8bbe\u5f53\u524d\u7f51\u7edc\u628a\u52a8\u4f5c a \u7684\u4ef7\u503c\u4f30\u9ad8\u4e86&#xff0c;\u76ee\u6807\u7f51\u7edc&#xff08;\u53c2\u6570\u4e0d\u540c&#xff09;\u5927\u6982\u7387\u4e0d\u4f1a\u72af\u540c\u6837\u7684\u9519\u3002\u8bef\u5dee\u76f8\u4e92\u72ec\u7acb&#xff0c;\u503e\u5411\u4e8e\u62b5\u6d88\u800c\u975e\u7d2f\u52a0\u3002<\/p>\n<p>\u6700\u901a\u4fd7\u7684\u89e3\u91ca\u5c31\u662fDQN \u50cf\u662f\u81ea\u5df1\u7ed9\u83dc\u6253\u5206\u3001\u81ea\u5df1\u6311\u83dc\u5403&#xff0c;\u8fd9\u6837\u70c2\u83dc\u53ef\u80fd\u5c31\u6df7\u8fdb\u6765\u4e86&#xff0c;\u800cDouble DQN \u8ba9\u670b\u53cb\u6253\u5206\u3001\u4f60\u6765\u6311&#xff0c;\u4e24\u8fb9\u7684\u8bef\u5dee\u5bf9\u51b2\u6389\u4e86\u3002<\/p>\n<p>  Standard DQN:  E[Q(s, argmax\u2090 Q(s,a))] \u2265 max\u2090 E[Q(s,a)]   &#xff08;\u6709\u504f&#xff09;<br \/>\n Double DQN:    E[Q\u2082(s, argmax\u2090 Q\u2081(s,a))] \u2248 max\u2090 E[Q(s,a)]  &#xff08;\u65e0\u504f&#xff09;<\/p>\n<p>\u4ece DQN \u5230 Double DQN&#xff0c;\u53ea\u9700\u8981\u6539\u4e00\u884c&#xff1a;<\/p>\n<p> # DQN \u76ee\u6807<br \/>\nnext_q_values&#061;target_network(next_states).max(1)[0]<br \/>\ntarget&#061;rewards&#043;gamma*next_q_values* (1-dones)  <\/p>\n<p># Double DQN 
\u76ee\u6807<br \/>\nnext_actions&#061;current_network(next_states).argmax(1)  # &lt;- \u7528\u5f53\u524d\u7f51\u7edc\u9009<br \/>\nnext_q_values&#061;target_network(next_states).gather(1, next_actions.unsqueeze(1))  # &lt;- \u7528\u76ee\u6807\u7f51\u7edc\u8bc4\u4f30<br \/>\n target&#061;rewards&#043;gamma*next_q_values.squeeze() * (1-dones)<\/p>\n<p>\u5c31\u8fd9\u4e00\u884c\u6539\u52a8\u6781\u5c0f&#xff0c;\u6548\u679c\u5374\u5f88\u660e\u663e\u3002<\/p>\n<h3>\u5b9e\u73b0&#xff1a;Double DQN<\/h3>\n<p>\u6269\u5c55 DQN Agent<\/p>\n<p> classDoubleDQNAgent(DQNAgent):<br \/>\n    &#034;&#034;&#034;<br \/>\n    Double DQN: \u901a\u8fc7\u89e3\u8026\u52a8\u4f5c\u9009\u62e9\u548c\u8bc4\u4f30\u6765\u51cf\u5c11\u8fc7\u4f30\u8ba1\u504f\u5dee\u3002<br \/>\n    &#034;&#034;&#034;  <\/p>\n<p>    def__init__(self, *args, **kwargs):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u521d\u59cb\u5316 Double DQN agent\u3002<br \/>\n        \u4ece DQN \u7ee7\u627f\u6240\u6709\u5185\u5bb9&#xff0c;\u53ea\u6539\u53d8\u76ee\u6807\u8ba1\u7b97\u3002<br \/>\n        &#034;&#034;&#034;<br \/>\n        super().__init__(*args, **kwargs)  <\/p>\n<p>    defupdate(self) -&gt;Dict[str, float]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6267\u884c Double DQN \u66f4\u65b0\u3002  <\/p>\n<p>        Returns:<br \/>\n            metrics: \u8bad\u7ec3\u6307\u6807<br \/>\n        &#034;&#034;&#034;<br \/>\n        iflen(self.replay_buffer) &lt;self.batch_size:<br \/>\n            return {}  <\/p>\n<p>        # \u91c7\u6837\u6279\u6b21<br \/>\n        states, actions, rewards, next_states, dones&#061;self.replay_buffer.sample(<br \/>\n            self.batch_size<br \/>\n        )  <\/p>\n<p>        states&#061;states.to(self.device)<br \/>\n        actions&#061;actions.to(self.device)<br \/>\n        rewards&#061;rewards.to(self.device)<br \/>\n        next_states&#061;next_states.to(self.device)<br \/>\n        dones&#061;dones.to(self.device)  <\/p>\n<p>        # \u5f53\u524d Q \u503c 
Q(s,a;\u03b8)<br \/>\n        current_q_values&#061;self.q_network(states).gather(1, actions.unsqueeze(1))  <\/p>\n<p>        # Double DQN \u76ee\u6807\u8ba1\u7b97<br \/>\n        withtorch.no_grad():<br \/>\n            # \u4f7f\u7528\u5f53\u524d\u7f51\u7edc\u9009\u62e9\u52a8\u4f5c<br \/>\n            next_actions&#061;self.q_network(next_states).argmax(1)  <\/p>\n<p>            # \u4f7f\u7528\u76ee\u6807\u7f51\u7edc\u8bc4\u4f30\u52a8\u4f5c<br \/>\n            next_q_values&#061;self.target_network(next_states).gather(<br \/>\n                1, next_actions.unsqueeze(1)<br \/>\n            ).squeeze()  <\/p>\n<p>            # \u8ba1\u7b97\u76ee\u6807<br \/>\n            target_q_values&#061;rewards&#043; (1-dones) *self.gamma*next_q_values  <\/p>\n<p>        # \u8ba1\u7b97\u635f\u5931<br \/>\n        loss&#061;F.mse_loss(current_q_values.squeeze(), target_q_values)  <\/p>\n<p>        # \u68af\u5ea6\u4e0b\u964d<br \/>\n        self.optimizer.zero_grad()<br \/>\n        loss.backward()<br \/>\n        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm&#061;10.0)<br \/>\n        self.optimizer.step()  <\/p>\n<p>        self.training_step&#043;&#061;1  <\/p>\n<p>        return {<br \/>\n            &#039;loss&#039;: loss.item(),<br \/>\n            &#039;q_mean&#039;: current_q_values.mean().item(),<br \/>\n            &#039;q_std&#039;: current_q_values.std().item(),<br \/>\n            &#039;target_q_mean&#039;: target_q_values.mean().item()<br \/>\n         }<\/p>\n<p>\u8bad\u7ec3\u51fd\u6570&#xff1a;<\/p>\n<p> deftrain_double_dqn(<br \/>\n    env_name: str,<br \/>\n    n_episodes: int&#061;1000,<br \/>\n    max_steps: int&#061;500,<br \/>\n    train_freq: int&#061;1,<br \/>\n    eval_frequency: int&#061;50,<br \/>\n    eval_episodes: int&#061;10,<br \/>\n    verbose: bool&#061;True,<br \/>\n    **kwargs<br \/>\n) -&gt;Tuple:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u8bad\u7ec3 Double DQN agent&#xff08;\u4f7f\u7528 DoubleDQNAgent 
\u800c\u4e0d\u662f DQNAgent&#xff09;\u3002<br \/>\n    &#034;&#034;&#034;<br \/>\n    # \u4e0e train_dqn \u76f8\u540c\u4f46\u4f7f\u7528 DoubleDQNAgent<br \/>\n    env&#061;gym.make(env_name)<br \/>\n    eval_env&#061;gym.make(env_name)  <\/p>\n<p>    state_dim&#061;env.observation_space.shape[0]<br \/>\n    action_dim&#061;env.action_space.n  <\/p>\n<p>    # \u4f7f\u7528 DoubleDQNAgent<br \/>\n    agent&#061;DoubleDQNAgent(<br \/>\n        state_dim&#061;state_dim,<br \/>\n        action_dim&#061;action_dim,<br \/>\n        **kwargs<br \/>\n    )  <\/p>\n<p>    # \u8bad\u7ec3\u5faa\u73af&#xff08;\u4e0e DQN \u76f8\u540c&#xff09;<br \/>\n    stats&#061; {<br \/>\n        &#039;episode_rewards&#039;: [],<br \/>\n        &#039;episode_lengths&#039;: [],<br \/>\n        &#039;losses&#039;: [],<br \/>\n        &#039;q_values&#039;: [],<br \/>\n        &#039;target_q_values&#039;: [],<br \/>\n        &#039;eval_rewards&#039;: [],<br \/>\n        &#039;eval_episodes&#039;: [],<br \/>\n        &#039;epsilons&#039;: []<br \/>\n    }  <\/p>\n<p>    print(f&#034;Training Double DQN on {env_name}&#034;)<br \/>\n    print(f&#034;State dim: {state_dim}, Action dim: {action_dim}&#034;)<br \/>\n    print(&#034;&#061;&#034;*70)  <\/p>\n<p>    forepisodeinrange(n_episodes):<br \/>\n        state, _&#061;env.reset()<br \/>\n        episode_reward&#061;0<br \/>\n        episode_length&#061;0<br \/>\n        episode_metrics&#061; []  <\/p>\n<p>        forstepinrange(max_steps):<br \/>\n            action&#061;agent.select_action(state, training&#061;True)<br \/>\n            next_state, reward, terminated, truncated, _&#061;env.step(action)<br \/>\n            done&#061;terminatedortruncated  <\/p>\n<p>            agent.store_transition(state, action, reward, next_state, done)  <\/p>\n<p>            ifstep%train_freq&#061;&#061;0:<br \/>\n                metrics&#061;agent.update()<br \/>\n                ifmetrics:<br \/>\n                    episode_metrics.append(metrics)  <\/p>\n<p>    
        episode_reward&#043;&#061;reward<br \/>\n            episode_length&#043;&#061;1<br \/>\n            state&#061;next_state  <\/p>\n<p>            ifdone:<br \/>\n                break  <\/p>\n<p>        # \u66f4\u65b0\u76ee\u6807\u7f51\u7edc<br \/>\n        if (episode&#043;1) %kwargs.get(&#039;target_update_freq&#039;, 10) &#061;&#061;0:<br \/>\n            agent.update_target_network()  <\/p>\n<p>        agent.decay_epsilon()  <\/p>\n<p>        # \u5b58\u50a8\u7edf\u8ba1\u4fe1\u606f<br \/>\n        stats[&#039;episode_rewards&#039;].append(episode_reward)<br \/>\n        stats[&#039;episode_lengths&#039;].append(episode_length)<br \/>\n        stats[&#039;epsilons&#039;].append(agent.epsilon)  <\/p>\n<p>        ifepisode_metrics:<br \/>\n            stats[&#039;losses&#039;].append(np.mean([m[&#039;loss&#039;] forminepisode_metrics]))<br \/>\n            stats[&#039;q_values&#039;].append(np.mean([m[&#039;q_mean&#039;] forminepisode_metrics]))<br \/>\n            stats[&#039;target_q_values&#039;].append(np.mean([m[&#039;target_q_mean&#039;] forminepisode_metrics]))  <\/p>\n<p>        # \u8bc4\u4f30<br \/>\n        if (episode&#043;1) %eval_frequency&#061;&#061;0:<br \/>\n            eval_reward&#061;evaluate_dqn(eval_env, agent, eval_episodes)<br \/>\n            stats[&#039;eval_rewards&#039;].append(eval_reward)<br \/>\n            stats[&#039;eval_episodes&#039;].append(episode&#043;1)  <\/p>\n<p>            ifverbose:<br \/>\n                avg_reward&#061;np.mean(stats[&#039;episode_rewards&#039;][-50:])<br \/>\n                avg_loss&#061;np.mean(stats[&#039;losses&#039;][-50:]) ifstats[&#039;losses&#039;] else0<br \/>\n                avg_q&#061;np.mean(stats[&#039;q_values&#039;][-50:]) ifstats[&#039;q_values&#039;] else0  <\/p>\n<p>                print(f&#034;Episode {episode&#043;1:4d} | &#034;<br \/>\n                      f&#034;Reward: {avg_reward:7.2f} | &#034;<br \/>\n                      f&#034;Eval: {eval_reward:7.2f} | &#034;<br 
\/>\n                      f&#034;Loss: {avg_loss:7.4f} | &#034;<br \/>\n                      f&#034;Q: {avg_q:6.2f} | &#034;<br \/>\n                      f&#034;\u03b5: {agent.epsilon:.3f}&#034;)  <\/p>\n<p>    env.close()<br \/>\n    eval_env.close()  <\/p>\n<p>    print(&#034;&#061;&#034;*70)<br \/>\n    print(&#034;Training complete!&#034;)  <\/p>\n<p>     returnagent, stats<\/p>\n<p>LunarLander-v3<\/p>\n<p> # \u8bad\u7ec3 Double DQN<br \/>\nif__name__&#061;&#061;&#034;__main__&#034;:<br \/>\n    device&#061;&#039;cuda&#039;iftorch.cuda.is_available() else&#039;cpu&#039;  <\/p>\n<p>    agent_ddqn, stats_ddqn&#061;train_double_dqn(<br \/>\n        env_name&#061;&#039;LunarLander-v3&#039;,<br \/>\n        n_episodes&#061;4000,<br \/>\n        max_steps&#061;1000,<br \/>\n        learning_rate&#061;5e-4,<br \/>\n        gamma&#061;0.99,<br \/>\n        epsilon_start&#061;1.0,<br \/>\n        epsilon_end&#061;0.01,<br \/>\n        epsilon_decay&#061;0.9995,<br \/>\n        buffer_capacity&#061;100000,<br \/>\n        batch_size&#061;128,<br \/>\n        target_update_freq&#061;20,<br \/>\n        train_freq&#061;4,<br \/>\n        eval_frequency&#061;100,<br \/>\n        eval_episodes&#061;10,<br \/>\n        hidden_dims&#061;[256, 256],<br \/>\n        device&#061;device,<br \/>\n        verbose&#061;True<br \/>\n    )  <\/p>\n<p>    # \u4fdd\u5b58\u6a21\u578b<br \/>\n     agent_ddqn.save(&#039;doubledqn_lunar_lander.pth&#039;)<\/p>\n<p>\u8f93\u51fa&#xff1a;<\/p>\n<p>  Training Double DQN on LunarLander-v3<br \/>\nState dim: 8, Action dim: 4<br 
\/>\n&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;<br \/>\nEpisode  100 | Reward: -155.24 | Eval: -885.72 | Loss: 52.9057 | Q:   0.20 | \u03b5: 0.951<br \/>\nEpisode  200 | Reward: -148.85 | Eval:  -85.94 | Loss: 37.2449 | Q:   2.14 | \u03b5: 0.905<br \/>\nEpisode  300 | Reward: -111.61 | Eval: -172.48 | Loss: 37.4279 | Q:   3.52 | \u03b5: 0.861<br \/>\nEpisode  400 | Reward:  -99.21 | Eval: -198.43 | Loss: 41.5296 | Q:   8.15 | \u03b5: 0.819<br \/>\nEpisode  500 | Reward:  -80.75 | Eval: -103.26 | Loss: 56.2701 | Q:  11.70 | \u03b5: 0.779<br \/>\n&#8230;<br \/>\nEpisode 3200 | Reward:  102.04 | Eval:  159.71 | Loss: 16.5263 | Q:  27.94 | \u03b5: 0.202<br \/>\nEpisode 3300 | Reward:  140.37 | Eval:  191.79 | Loss: 22.5564 | Q:  29.81 | \u03b5: 0.192<br \/>\nEpisode 3400 | Reward:  114.08 | Eval:  269.40 | Loss: 23.2846 | Q:  32.40 | \u03b5: 0.183<br \/>\nEpisode 3500 | Reward:  166.33 | Eval:  244.32 | Loss: 21.8558 | Q:  32.51 | \u03b5: 0.174<br \/>\nEpisode 3600 | Reward:  150.80 | Eval:  265.42 | Loss: 21.6430 | Q:  33.18 | \u03b5: 0.165<br \/>\nEpisode 3700 | Reward:  148.59 | Eval:  239.56 | Loss: 23.8328 | Q:  34.65 | \u03b5: 0.157<br \/>\nEpisode 3800 | Reward:  162.82 | Eval:  233.36 | Loss: 28.3445 | Q:  37.46 | \u03b5: 0.149<br \/>\nEpisode 3900 | Reward:  177.70 | Eval:  259.99 | Loss: 36.2971 | Q:  40.22 | \u03b5: 0.142<br \/>\nEpisode 4000 | Reward:  156.60 | Eval:  251.17 | Loss: 46.7266 | Q:  42.15 | \u03b5: 0.135<br 
\/>\n&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;&#061;<br \/>\n Training complete!<\/p>\n<h3>Dueling DQN&#xff1a;\u5206\u79bb\u503c\u548c\u4f18\u52bf<\/h3>\n<p>\u5f88\u591a\u72b6\u6001\u4e0b&#xff0c;\u9009\u54ea\u4e2a\u52a8\u4f5c\u5176\u5b9e\u5dee\u522b\u4e0d\u5927\u3002CartPole \u91cc\u6746\u5b50\u521a\u597d\u5e73\u8861\u65f6&#xff0c;\u5411\u5de6\u5411\u53f3\u90fd\u884c&#xff1b;\u5f00\u8f66\u8d70\u76f4\u7ebf\u65b9\u5411\u76d8\u5fae\u8c03\u7684\u7ed3\u679c\u5dee\u4e0d\u591a&#xff1b;LunarLander \u79bb\u5730\u9762\u8fd8\u8fdc\u7684\u65f6\u5019&#xff0c;\u5f15\u64ce\u600e\u4e48\u55b7\u5f71\u54cd\u4e5f\u6709\u9650\u3002<\/p>\n<p>\u6807\u51c6 DQN \u5bf9\u6bcf\u4e2a\u52a8\u4f5c\u5355\u72ec\u5b66 Q(s,a)&#xff0c;\u628a\u7f51\u7edc\u5bb9\u91cf\u6d6a\u8d39\u5728\u5197\u4f59\u4fe1\u606f\u4e0a\u3002Dueling DQN \u7684\u601d\u8def\u662f\u628a Q \u62c6\u6210\u4e24\u90e8\u5206&#xff1a;V(s) \u8868\u793a&#034;\u8fd9\u4e2a\u72b6\u6001\u672c\u8eab\u503c\u591a\u5c11&#034;&#xff0c;A(s,a) \u8868\u793a&#034;\u8fd9\u4e2a\u52a8\u4f5c\u6bd4\u5e73\u5747\u6c34\u5e73\u597d\u591a\u5c11&#034;\u3002<\/p>\n<p>\u67b6\u6784\u5982\u4e0b<\/p>\n<p> \u6807\u51c6 DQN:<br \/>\n Input -&gt; Hidden Layers -&gt; Q(s,a\u2081), Q(s,a\u2082), &#8230;, Q(s,a\u2099)  <\/p>\n<p>Dueling DQN:<br \/>\n                       |-&gt; Value Stream -&gt; V(s)<br \/>\nInput -&gt; Shared Layers |<br \/>\n                       |-&gt; Advantage Stream -&gt; A(s,a\u2081), A(s,a\u2082), &#8230;, A(s,a\u2099)  <\/p>\n<p> Q(s,a) &#061; V(s) &#043; (A(s,a) &#8211; 
mean(A(s,\u00b7)))<\/p>\n<p>\u4e3a\u4ec0\u4e48\u8981\u51cf\u53bb\u5747\u503c&#xff1f;\u4e0d\u51cf\u7684\u8bdd&#xff0c;\u4efb\u4f55\u5e38\u6570\u52a0\u5230 V \u518d\u4ece A \u51cf\u6389&#xff0c;\u5f97\u5230\u7684 Q \u5b8c\u5168\u4e00\u6837&#xff0c;\u7f51\u7edc\u5b66\u4e0d\u51fa\u552f\u4e00\u89e3\u3002<\/p>\n<p>\u6570\u5b66\u8868\u8fbe\u5982\u4e0b&#xff1a;<\/p>\n<p> Q(s,a) &#061; V(s) &#043; A(s,a) &#8211; (1\/|A|)\u00b7\u03a3\u2090&#039; A(s,a&#039;)<\/p>\n<p>\u4e5f\u53ef\u4ee5\u7528 max \u4ee3\u66ff mean&#xff1a;<\/p>\n<p> Q(s,a) &#061; V(s) &#043; A(s,a) &#8211; max\u2090&#039; A(s,a&#039;)<\/p>\n<p>\u5b9e\u8df5\u4e2d max \u7248\u672c\u6709\u65f6\u6548\u679c\u66f4\u597d\u3002<\/p>\n<p>\u4e3e\u4e2a\u4f8b\u5b50&#xff1a;V(s) &#061; 10&#xff0c;\u597d\u52a8\u4f5c\u7684 A \u662f &#043;5&#xff0c;\u5dee\u52a8\u4f5c\u7684 A \u662f -3&#xff0c;\u5e73\u5747\u4f18\u52bf &#061; (&#043;5-3)\/2 &#061; &#043;1\u3002\u90a3\u4e48 Q(s, \u597d\u52a8\u4f5c) &#061; 10 &#043; 5 &#8211; 1 &#061; 14&#xff0c;Q(s, \u5dee\u52a8\u4f5c) &#061; 10 &#8211; 3 &#8211; 1 &#061; 6\u3002<\/p>\n<p>\u5b9e\u73b0<\/p>\n<p> classDuelingQNetwork(nn.Module):<br \/>\n    &#034;&#034;&#034;<br \/>\n    Dueling DQN \u67b6\u6784&#xff0c;\u5206\u79bb\u503c\u548c\u4f18\u52bf\u3002  <\/p>\n<p>    \u7406\u8bba: Q(s,a) &#061; V(s) &#043; A(s,a) &#8211; mean(A(s,\u00b7))<br \/>\n    &#034;&#034;&#034;  <\/p>\n<p>    def__init__(<br \/>\n        self,<br \/>\n        state_dim: int,<br \/>\n        action_dim: int,<br \/>\n        hidden_dims: List[int] &#061; [128, 128]<br \/>\n    ):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u521d\u59cb\u5316 Dueling Q \u7f51\u7edc\u3002  <\/p>\n<p>        Args:<br \/>\n            state_dim: \u72b6\u6001\u7a7a\u95f4\u7ef4\u5ea6<br \/>\n            action_dim: \u52a8\u4f5c\u6570\u91cf<br \/>\n            hidden_dims: \u5171\u4eab\u5c42\u5927\u5c0f<br \/>\n        &#034;&#034;&#034;<br \/>\n        super(DuelingQNetwork, self).__init__()  <\/p>\n<p>        
self.state_dim&#061;state_dim<br \/>\n        self.action_dim&#061;action_dim  <\/p>\n<p>        # \u5171\u4eab\u7279\u5f81\u63d0\u53d6\u5668<br \/>\n        shared_layers&#061; []<br \/>\n        input_dim&#061;state_dim  <\/p>\n<p>        forhidden_diminhidden_dims:<br \/>\n            shared_layers.append(nn.Linear(input_dim, hidden_dim))<br \/>\n            shared_layers.append(nn.ReLU())<br \/>\n            input_dim&#061;hidden_dim  <\/p>\n<p>        self.shared_network&#061;nn.Sequential(*shared_layers)  <\/p>\n<p>        # \u503c\u6d41: V(s) &#061; \u72b6\u6001\u7684\u6807\u91cf\u503c<br \/>\n        self.value_stream&#061;nn.Sequential(<br \/>\n            nn.Linear(hidden_dims[-1], 128),<br \/>\n            nn.ReLU(),<br \/>\n            nn.Linear(128, 1)<br \/>\n        )  <\/p>\n<p>        # \u4f18\u52bf\u6d41: A(s,a) &#061; \u6bcf\u4e2a\u52a8\u4f5c\u7684\u4f18\u52bf<br \/>\n        self.advantage_stream&#061;nn.Sequential(<br \/>\n            nn.Linear(hidden_dims[-1], 128),<br \/>\n            nn.ReLU(),<br \/>\n            nn.Linear(128, action_dim)<br \/>\n        )  <\/p>\n<p>        # \u521d\u59cb\u5316\u6743\u91cd<br \/>\n        self.apply(self._init_weights)  <\/p>\n<p>    def_init_weights(self, module):<br \/>\n        &#034;&#034;&#034;\u521d\u59cb\u5316\u7f51\u7edc\u6743\u91cd\u3002&#034;&#034;&#034;<br \/>\n        ifisinstance(module, nn.Linear):<br \/>\n            nn.init.kaiming_normal_(module.weight, nonlinearity&#061;&#039;relu&#039;)<br \/>\n            nn.init.constant_(module.bias, 0.0)  <\/p>\n<p>    defforward(self, state: torch.Tensor) -&gt;torch.Tensor:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u901a\u8fc7 dueling \u67b6\u6784\u7684\u524d\u5411\u4f20\u64ad\u3002  <\/p>\n<p>        Args:<br \/>\n            state: \u72b6\u6001\u6279\u6b21, \u5f62\u72b6 (batch_size, state_dim)  <\/p>\n<p>        Returns:<br \/>\n            q_values: \u6240\u6709\u52a8\u4f5c\u7684 Q(s,a), \u5f62\u72b6 (batch_size, action_dim)<br \/>\n   
     &#034;&#034;&#034;<br \/>\n        # \u5171\u4eab\u7279\u5f81<br \/>\n        features&#061;self.shared_network(state)  <\/p>\n<p>        # \u503c: V(s) -&gt; \u5f62\u72b6 (batch_size, 1)<br \/>\n        value&#061;self.value_stream(features)  <\/p>\n<p>        # \u4f18\u52bf: A(s,a) -&gt; \u5f62\u72b6 (batch_size, action_dim)<br \/>\n        advantages&#061;self.advantage_stream(features)  <\/p>\n<p>        # \u7ec4\u5408: Q(s,a) &#061; V(s) &#043; A(s,a) &#8211; mean(A(s,\u00b7))<br \/>\n        q_values&#061;value&#043;advantages-advantages.mean(dim&#061;1, keepdim&#061;True)  <\/p>\n<p>        returnq_values  <\/p>\n<p>    defget_action(self, state: np.ndarray, epsilon: float&#061;0.0) -&gt;int:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u4f7f\u7528 \u03b5-greedy \u7b56\u7565\u9009\u62e9\u52a8\u4f5c\u3002<br \/>\n        &#034;&#034;&#034;<br \/>\n        ifrandom.random() &lt;epsilon:<br \/>\n            returnrandom.randint(0, self.action_dim-1)<br \/>\n        else:<br \/>\n            withtorch.no_grad():<br \/>\n                state_tensor&#061;torch.FloatTensor(state).unsqueeze(0).to(<br \/>\n                    next(self.parameters()).device<br \/>\n                )<br \/>\n                q_values&#061;self.forward(state_tensor)<br \/>\n                 returnq_values.argmax(dim&#061;1).item()<\/p>\n<p>Dueling \u67b6\u6784\u7684\u597d\u5904&#xff1a;\u5728\u52a8\u4f5c\u5f71\u54cd\u4e0d\u5927\u7684\u72b6\u6001\u4e0b\u5b66\u5f97\u66f4\u597d&#xff0c;\u68af\u5ea6\u6d41\u52a8\u66f4\u901a\u7545\u6240\u4ee5\u6536\u655b\u66f4\u5feb&#xff0c;\u503c\u4f30\u8ba1\u4e5f\u66f4\u7a33\u5065\u3002<\/p>\n<p>\u8fd8\u53ef\u4ee5\u628a\u4e24\u79cd\u6539\u8fdb\u53e0\u5728\u4e00\u8d77&#xff0c;\u505a\u6210Double Dueling DQN<\/p>\n<p> classDoubleDuelingDQNAgent(DoubleDQNAgent):<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u7ed3\u5408 Double DQN \u548c Dueling DQN \u7684\u667a\u80fd\u4f53\u3002<br \/>\n    &#034;&#034;&#034;  <\/p>\n<p>    def__init__(<br \/>\n        
self,<br \/>\n        state_dim: int,<br \/>\n        action_dim: int,<br \/>\n        hidden_dims: List[int] &#061; [128, 128],<br \/>\n        **kwargs<br \/>\n    ):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u521d\u59cb\u5316 Double Dueling DQN \u667a\u80fd\u4f53\u3002<br \/>\n        \u4f7f\u7528 DuelingQNetwork \u800c\u4e0d\u662f\u6807\u51c6 QNetwork\u3002<br \/>\n        &#034;&#034;&#034;<br \/>\n        # \u6682\u4e0d\u8c03\u7528 super().__init__()<br \/>\n        # \u6211\u4eec\u9700\u8981\u4ee5\u4e0d\u540c\u65b9\u5f0f\u8bbe\u7f6e\u7f51\u7edc  <\/p>\n<p>        self.state_dim&#061;state_dim<br \/>\n        self.action_dim&#061;action_dim<br \/>\n        self.gamma&#061;kwargs.get(&#039;gamma&#039;, 0.99)<br \/>\n        self.batch_size&#061;kwargs.get(&#039;batch_size&#039;, 64)<br \/>\n        self.target_update_freq&#061;kwargs.get(&#039;target_update_freq&#039;, 10)<br \/>\n        self.device&#061;torch.device(kwargs.get(&#039;device&#039;, &#039;cpu&#039;))  <\/p>\n<p>        # \u63a2\u7d22<br \/>\n        self.epsilon&#061;kwargs.get(&#039;epsilon_start&#039;, 1.0)<br \/>\n        self.epsilon_end&#061;kwargs.get(&#039;epsilon_end&#039;, 0.01)<br \/>\n        self.epsilon_decay&#061;kwargs.get(&#039;epsilon_decay&#039;, 0.995)  <\/p>\n<p>        # \u4f7f\u7528 Dueling \u67b6\u6784<br \/>\n        self.q_network&#061;DuelingQNetwork(<br \/>\n            state_dim, action_dim, hidden_dims<br \/>\n        ).to(self.device)  <\/p>\n<p>        self.target_network&#061;DuelingQNetwork(<br \/>\n            state_dim, action_dim, hidden_dims<br \/>\n        ).to(self.device)  <\/p>\n<p>        self.target_network.load_state_dict(self.q_network.state_dict())<br \/>\n        self.target_network.eval()  <\/p>\n<p>        # \u4f18\u5316\u5668<br \/>\n        learning_rate&#061;kwargs.get(&#039;learning_rate&#039;, 1e-3)<br \/>\n        self.optimizer&#061;torch.optim.Adam(self.q_network.parameters(), lr&#061;learning_rate)  <\/p>\n<p>        # 
\u56de\u653e\u7f13\u51b2\u533a<br \/>\n        buffer_capacity&#061;kwargs.get(&#039;buffer_capacity&#039;, 100000)<br \/>\n        self.replay_buffer&#061;ReplayBuffer(buffer_capacity)  <\/p>\n<p>        # \u7edf\u8ba1<br \/>\n        self.episode_count&#061;0<br \/>\n        self.training_step&#061;0  <\/p>\n<p>     # update() \u65b9\u6cd5\u7ee7\u627f\u81ea DoubleDQNAgent<\/p>\n<h3>\u4f18\u5148\u7ecf\u9a8c\u56de\u653e<\/h3>\n<p>\u4e0d\u662f\u6240\u6709\u7ecf\u9a8c\u90fd\u540c\u7b49\u6709\u4ef7\u503c\u3002TD \u8bef\u5dee\u5927\u7684\u8f6c\u6362\u8bf4\u660e\u9884\u6d4b\u504f\u79bb\u73b0\u5b9e&#xff0c;\u80fd\u5b66\u5230\u4e1c\u897f&#xff1b;TD \u8bef\u5dee\u5c0f\u7684\u8f6c\u6362\u8bf4\u660e\u5df2\u7ecf\u5b66\u5f97\u5dee\u4e0d\u591a\u4e86\u518d\u91c7\u5230\u4e5f\u6ca1\u591a\u5927\u7528\u3002<\/p>\n<p>\u5747\u5300\u91c7\u6837\u628a\u6240\u6709\u8f6c\u6362\u4e00\u89c6\u540c\u4ec1&#xff0c;\u6d6a\u8d39\u4e86\u5b66\u4e60\u673a\u4f1a\u3002\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u7684\u601d\u8def\u662f&#xff1a;\u8ba9\u91cd\u8981\u7684\u8f6c\u6362\u88ab\u91c7\u5230\u7684\u6982\u7387\u66f4\u9ad8\u3002<\/p>\n<p>\u4f18\u5148\u7ea7\u600e\u4e48\u7b97<\/p>\n<p> p\u1d62 &#061; |\u03b4\u1d62| &#043; \u03b5  <\/p>\n<p> \u5176\u4e2d:<br \/>\n \u03b4\u1d62 &#061; r &#043; \u03b3\u00b7max Q(s&#039;,a&#039;) &#8211; Q(s,a)   &#xff08;TD \u8bef\u5dee&#xff09;<br \/>\n \u03b5 &#061; \u5c0f\u5e38\u6570&#xff0c;\u4fdd\u8bc1\u6240\u6709\u8f6c\u6362\u90fd\u6709\u88ab\u91c7\u5230\u7684\u53ef\u80fd<\/p>\n<p>\u91c7\u6837\u6982\u7387&#xff1a;<\/p>\n<p>  P(i) &#061; p\u1d62^\u03b1 \/ \u03a3\u2c7c p\u2c7c^\u03b1  <\/p>\n<p> \u03b1 \u63a7\u5236\u4f18\u5148\u5316\u7a0b\u5ea6:<br \/>\n \u03b1 &#061; 0 -&gt; \u9000\u5316\u6210\u5747\u5300\u91c7\u6837<br \/>\n \u03b1 &#061; 1 -&gt; 
\u5b8c\u5168\u6309\u4f18\u5148\u7ea7\u6bd4\u4f8b\u91c7\u6837<\/p>\n<p>\u4f18\u5148\u91c7\u6837\u6539\u4e86\u6570\u636e\u5206\u5e03&#xff0c;\u4f1a\u5f15\u5165\u504f\u5dee\u3002\u6240\u4ee5\u89e3\u51b3\u529e\u6cd5\u662f\u7528\u91cd\u8981\u6027\u91c7\u6837\u6bd4\u7387\u6765\u52a0\u6743\u66f4\u65b0&#xff1a;<\/p>\n<p> w\u1d62 &#061; (N \u00b7 P(i))^(-\u03b2)  <\/p>\n<p> \u03b2 \u63a7\u5236\u6821\u6b63\u529b\u5ea6:<br \/>\n \u03b2 &#061; 0 -&gt; \u4e0d\u6821\u6b63<br \/>\n \u03b2 &#061; 1 -&gt; \u5b8c\u5168\u6821\u6b63<\/p>\n<p>\u901a\u5e38 \u03b2 \u4ece 0.4 \u5f00\u59cb&#xff0c;\u968f\u8bad\u7ec3\u9010\u6e10\u589e\u5927\u5230 1.0\u3002<\/p>\n<p>\u5b9e\u73b0<\/p>\n<p> classPrioritizedReplayBuffer:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u7f13\u51b2\u533a\u3002  <\/p>\n<p>    \u7406\u8bba: \u6309 TD \u8bef\u5dee\u6bd4\u4f8b\u91c7\u6837\u8f6c\u6362\u3002<br \/>\n    \u6211\u4eec\u53ef\u4ee5\u4ece\u4e2d\u5b66\u5230\u66f4\u591a\u7684\u8f6c\u6362\u4f1a\u88ab\u66f4\u9891\u7e41\u5730\u91c7\u6837\u3002<br \/>\n    &#034;&#034;&#034;  <\/p>\n<p>    def__init__(self, capacity: int, alpha: float&#061;0.6, beta: float&#061;0.4):<br \/>\n        &#034;&#034;&#034;<br \/>\n        Args:<br \/>\n            capacity: \u7f13\u51b2\u533a\u6700\u5927\u5bb9\u91cf<br \/>\n            alpha: \u4f18\u5148\u5316\u6307\u6570&#xff08;0&#061;\u5747\u5300, 1&#061;\u6bd4\u4f8b&#xff09;<br \/>\n            beta: \u91cd\u8981\u6027\u91c7\u6837\u6307\u6570&#xff08;\u9000\u706b\u5230 1.0&#xff09;<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.capacity&#061;capacity<br \/>\n        self.alpha&#061;alpha<br \/>\n        self.beta&#061;beta<br \/>\n        self.beta_increment&#061;0.001  # \u968f\u65f6\u95f4\u9000\u706b beta  <\/p>\n<p>        self.buffer&#061; []<br \/>\n        self.priorities&#061;np.zeros(capacity, dtype&#061;np.float32)<br \/>\n        self.position&#061;0  <\/p>\n<p>    defpush(self, state, action, reward, next_state, done):<br \/>\n        
&#034;&#034;&#034;<br \/>\n        \u4ee5\u6700\u5927\u4f18\u5148\u7ea7\u6dfb\u52a0\u8f6c\u6362\u3002  <\/p>\n<p>        \u7406\u8bba: \u65b0\u8f6c\u6362\u83b7\u5f97\u6700\u5927\u4f18\u5148\u7ea7&#xff08;\u4f1a\u5f88\u5feb\u88ab\u91c7\u6837&#xff09;\u3002<br \/>\n        \u5b83\u4eec\u7684\u5b9e\u9645\u4f18\u5148\u7ea7\u5728\u9996\u6b21 TD \u8bef\u5dee\u8ba1\u7b97\u540e\u66f4\u65b0\u3002<br \/>\n        &#034;&#034;&#034;<br \/>\n        max_priority&#061;self.priorities.max() ifself.bufferelse1.0  <\/p>\n<p>        iflen(self.buffer) &lt;self.capacity:<br \/>\n            self.buffer.append((state, action, reward, next_state, done))<br \/>\n        else:<br \/>\n            self.buffer[self.position] &#061; (state, action, reward, next_state, done)  <\/p>\n<p>        self.priorities[self.position] &#061;max_priority<br \/>\n        self.position&#061; (self.position&#043;1) %self.capacity  <\/p>\n<p>    defsample(self, batch_size: int):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6309\u4f18\u5148\u7ea7\u6bd4\u4f8b\u91c7\u6837\u6279\u6b21\u3002  <\/p>\n<p>        Returns:<br \/>\n            batch: \u91c7\u6837\u7684\u8f6c\u6362<br \/>\n            indices: \u91c7\u6837\u8f6c\u6362\u7684\u7d22\u5f15&#xff08;\u7528\u4e8e\u4f18\u5148\u7ea7\u66f4\u65b0&#xff09;<br \/>\n            weights: \u91cd\u8981\u6027\u91c7\u6837\u6743\u91cd<br \/>\n        &#034;&#034;&#034;<br \/>\n        iflen(self.buffer) &#061;&#061;self.capacity:<br \/>\n            priorities&#061;self.priorities<br \/>\n        else:<br \/>\n            priorities&#061;self.priorities[:len(self.buffer)]  <\/p>\n<p>        # \u8ba1\u7b97\u91c7\u6837\u6982\u7387<br \/>\n        probs&#061;priorities**self.alpha<br \/>\n        probs\/&#061;probs.sum()  <\/p>\n<p>        # \u91c7\u6837\u7d22\u5f15<br \/>\n        indices&#061;np.random.choice(len(self.buffer), batch_size, p&#061;probs, replace&#061;False)  <\/p>\n<p>        # \u83b7\u53d6\u8f6c\u6362<br \/>\n        batch&#061; [self.buffer[idx] 
foridxinindices]  <\/p>\n<p>        # \u8ba1\u7b97\u91cd\u8981\u6027\u91c7\u6837\u6743\u91cd<br \/>\n        total&#061;len(self.buffer)<br \/>\n        weights&#061; (total*probs[indices]) ** (-self.beta)<br \/>\n        weights\/&#061;weights.max()  # \u5f52\u4e00\u5316\u4ee5\u4fdd\u6301\u7a33\u5b9a\u6027  <\/p>\n<p>        # \u9000\u706b beta<br \/>\n        self.beta&#061;min(1.0, self.beta&#043;self.beta_increment)  <\/p>\n<p>        # \u8f6c\u6362\u4e3a tensor<br \/>\n        states, actions, rewards, next_states, dones&#061;zip(*batch)  <\/p>\n<p>        states&#061;torch.FloatTensor(np.array(states))<br \/>\n        actions&#061;torch.LongTensor(actions)<br \/>\n        rewards&#061;torch.FloatTensor(rewards)<br \/>\n        next_states&#061;torch.FloatTensor(np.array(next_states))<br \/>\n        dones&#061;torch.FloatTensor(dones)<br \/>\n        weights&#061;torch.FloatTensor(weights)  <\/p>\n<p>        return (states, actions, rewards, next_states, dones), indices, weights  <\/p>\n<p>    defupdate_priorities(self, indices, td_errors):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6839\u636e TD \u8bef\u5dee\u66f4\u65b0\u4f18\u5148\u7ea7\u3002  <\/p>\n<p>        Args:<br \/>\n            indices: \u91c7\u6837\u8f6c\u6362\u7684\u7d22\u5f15<br \/>\n            td_errors: \u90a3\u4e9b\u8f6c\u6362\u7684 TD \u8bef\u5dee<br \/>\n        &#034;&#034;&#034;<br \/>\n        foridx, td_errorinzip(indices, td_errors):<br \/>\n            self.priorities[idx] &#061;abs(td_error) &#043;1e-6  <\/p>\n<p>    def__len__(self):<br \/>\n         returnlen(self.buffer)<\/p>\n<p>\u751f\u4ea7\u73af\u5883\u4f1a\u7528 sum-tree \u6570\u636e\u7ed3\u6784&#xff0c;\u91c7\u6837\u590d\u6742\u5ea6\u662f O(log N) \u800c\u4e0d\u662f\u8fd9\u91cc\u7684 O(N)\u3002\u8fd9\u4e2a\u7b80\u5316\u7248\u672c\u4ee5\u53ef\u8bfb\u6027\u4e3a\u4f18\u5148\u3002<\/p>\n<h3>DQN 
\u53d8\u4f53\u5bf9\u6bd4<\/h3>\n<p>\u51e0\u4e2a\u53d8\u4f53\u5404\u81ea\u89e3\u51b3\u4ec0\u4e48\u95ee\u9898\u5462&#xff1f;<\/p>\n<p>DQN \u662f\u57fa\u7ebf&#xff0c;\u7528\u5355\u4e00\u7f51\u7edc\u9009\u52a8\u4f5c\u3001\u8bc4\u4f30\u52a8\u4f5c\u3002\u5b83\u5f15\u5165\u4e86\u76ee\u6807\u7f51\u7edc\u6765\u7a33\u5b9a&#034;\u79fb\u52a8\u76ee\u6807&#034;\u95ee\u9898&#xff0c;\u4f46\u5bb9\u6613\u8fc7\u4f30\u8ba1 Q \u503c&#xff0c;\u566a\u58f0\u8ba9\u667a\u80fd\u4f53\u53bb\u8ffd\u9010\u6839\u672c\u4e0d\u5b58\u5728\u7684&#034;\u5e7d\u7075\u5956\u52b1&#034;\u3002<\/p>\n<p>Double DQN \u628a\u9009\u548c\u8bc4\u62c6\u5f00\u3002\u5728\u7ebf\u7f51\u7edc\u9009\u52a8\u4f5c&#xff0c;\u76ee\u6807\u7f51\u7edc\u8bc4\u4f30\u4ef7\u503c\u3002\u5b9e\u6d4b\u4e0b\u6765\u80fd\u6709\u6548\u538b\u4f4e\u4e0d\u5207\u5b9e\u9645\u7684 Q \u503c&#xff0c;\u5b66\u4e60\u66f2\u7ebf\u660e\u663e\u66f4\u5e73\u6ed1\u3002<\/p>\n<p>Dueling DQN \u6362\u4e86\u7f51\u7edc\u67b6\u6784&#xff0c;\u5355\u72ec\u5b66 V(s) \u548c A(s,a)\u3002\u5b83\u7684\u6838\u5fc3\u8ba4\u77e5\u662f&#xff1a;\u5f88\u591a\u72b6\u6001\u4e0b\u5177\u4f53\u52a8\u4f5c\u7684\u5f71\u54cd\u4e0d\u5927\u3002\u5728 LunarLander \u8fd9\u79cd\u5b58\u5728\u5927\u91cf&#034;\u5197\u4f59\u52a8\u4f5c&#034;\u7684\u73af\u5883\u91cc&#xff0c;\u6837\u672c\u6548\u7387\u63d0\u5347\u660e\u663e\u2014\u2014\u4e0d\u7528\u4e3a\u6bcf\u6b21\u5f15\u64ce\u8109\u51b2\u90fd\u91cd\u65b0\u5b66\u72b6\u6001\u503c\u3002<\/p>\n<p>Double Dueling DQN \u628a\u4e24\u8fb9\u7684\u597d\u5904\u7ed3\u5408\u8d77\u6765&#xff0c;\u65e2\u51cf\u5c11\u4f30\u8ba1\u566a\u58f0&#xff0c;\u53c8\u63d0\u9ad8\u8868\u793a\u6548\u7387\u3002\u5b9e\u6d4b\u4e2d\u8fd9\u4e2a\u7ec4\u5408\u6700\u7a33\u5065&#xff0c;\u8fbe\u5230\u5cf0\u503c\u6027\u80fd\u7684\u901f\u5ea6\u548c\u53ef\u9760\u6027\u90fd\u4f18\u4e8e\u5355\u4e00\u6539\u8fdb\u3002<\/p>\n<h3>\u5b9e\u8df5\u5efa\u8bae<\/h3>\n<p>\u53d8\u4f53\u9009\u62e9\u5bf9\u6bd4 <img decoding=\"async\" 
src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2026\/02\/20260201140558-697f5dc6dcd78.jpg\" alt=\"\" \/> Double DQN \u8dd1\u5f97\u6bd4 DQN \u8fd8\u5dee&#xff1f;\u53ef\u80fd\u662f\u8bad\u7ec3\u4e0d\u591f\u957f&#xff08;Double DQN \u8d77\u6b65\u5076\u5c14\u6162\u4e00\u70b9&#xff09;&#xff0c;\u6216\u8005\u76ee\u6807\u7f51\u7edc\u66f4\u65b0\u592a\u9891\u7e41&#xff0c;\u6216\u8005\u5b66\u4e60\u7387\u504f\u9ad8\u3002\u8fd9\u65f6\u53ef\u4ee5\u5c06\u8bad\u7ec3\u65f6\u95f4\u7ffb\u500d&#xff0c;target_update_freq \u8c03\u5927&#xff0c;\u5b66\u4e60\u7387\u780d 2-5 \u500d\u3002<\/p>\n<p>Dueling \u67b6\u6784\u6ca1\u5e26\u6765\u6539\u5584&#xff1f;\u53ef\u80fd\u662f\u73af\u5883\u672c\u8eab\u4e0d\u9002\u5408&#xff08;\u6240\u6709\u72b6\u6001\u90fd\u5f88\u5173\u952e&#xff09;&#xff0c;\u6216\u8005\u7f51\u7edc\u592a\u5c0f&#xff0c;\u6216\u8005\u503c\u6d41\/\u4f18\u52bf\u6d41\u592a\u6d45\u3002\u9700\u8981\u5bf9\u7f51\u7edc\u52a0\u5bbd\u52a0\u6df1&#xff0c;\u786e\u8ba4\u73af\u5883\u91cc\u786e\u5b9e\u6709&#034;\u4e2d\u6027&#034;\u72b6\u6001\u3002<\/p>\n<p>PER \u5bfc\u81f4\u4e0d\u7a33\u5b9a&#xff1f;\u53ef\u80fd\u662f \u03b2 \u9000\u706b\u592a\u5feb\u3001\u03b1 \u8bbe\u592a\u9ad8\u3001\u91cd\u8981\u6027\u91c7\u6837\u6743\u91cd\u6ca1\u5f52\u4e00\u5316\u3002\u53ef\u4ee5\u51cf\u6162 \u03b2 \u589e\u91cf\u3001\u03b1 \u964d\u5230 0.4-0.6\u3001\u786e\u8ba4\u6743\u91cd\u505a\u4e86\u5f52\u4e00\u5316\u3002<\/p>\n<p>\u9996\u9009 Double DQN \u8d77\u6b65&#xff0c;\u4ee3\u7801\u6539\u52a8\u6781\u5c0f&#xff0c;\u6536\u76ca\u660e\u786e&#xff0c;\u6ca1\u6709\u989d\u5916\u590d\u6742\u5ea6\u3002<\/p>\n<p>\u4ec0\u4e48\u65f6\u5019\u52a0 Dueling&#xff1a;\u72b6\u6001\u503c\u6bd4\u52a8\u4f5c\u4f18\u52bf\u66f4\u91cd\u8981\u7684\u73af\u5883&#xff0c;\u5927\u91cf\u72b6\u6001\u4e0b\u52a8\u4f5c\u503c\u5dee\u4e0d\u591a&#xff0c;\u9700\u8981\u66f4\u5feb\u6536\u655b\u3002<\/p>\n<p>\u4ec0\u4e48\u65f6\u5019\u52a0 PER&#xff1a;\u6837\u672c\u6548\u7387\u81f3\u5173\u91cd\u8981&#xff0c;\u6709\u7b97\u529b\u9884\u7b97&#xff08;PER 
\u6bd4\u5747\u5300\u91c7\u6837\u6162&#xff09;&#xff0c;\u5956\u52b1\u7a00\u758f&#xff08;\u5e2e\u52a9\u5173\u6ce8\u5c11\u89c1\u7684\u6210\u529f\u7ecf\u9a8c&#xff09;\u3002<\/p>\n<p>\u6700\u540eRainbow \u628a\u516d\u9879\u6539\u8fdb\u53e0\u5728\u4e00\u8d77&#xff1a;Double DQN\u3001Dueling DQN\u3001\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u3001\u591a\u6b65\u5b66\u4e60&#xff08;n-step returns&#xff09;\u3001\u5206\u5e03\u5f0f RL&#xff08;C51&#xff09;\u3001\u566a\u58f0\u7f51\u7edc&#xff08;\u53c2\u6570\u7a7a\u95f4\u63a2\u7d22&#xff09;\u3002<\/p>\n<p>\u591a\u6b65\u5b66\u4e60\u628a 1-step TD \u6362\u6210 n-step \u56de\u62a5&#xff1a;<\/p>\n<p> # 1-step TD:<br \/>\n y &#061; r\u209c &#043; \u03b3\u00b7max Q(s\u209c\u208a\u2081, a)  <\/p>\n<p> # n-step:<br \/>\n y &#061; r\u209c &#043; \u03b3\u00b7r\u209c\u208a\u2081 &#043; \u03b3\u00b2\u00b7r\u209c\u208a\u2082 &#043; &#8230; &#043; \u03b3\u207f\u00b7max Q(s\u209c\u208a\u2099, a)<\/p>\n<p>\u597d\u5904\u662f\u4fe1\u7528\u5206\u914d\u66f4\u6e05\u6670&#xff0c;\u5b66\u4e60\u66f4\u5feb\u3002<\/p>\n<h3>\u5c0f\u7ed3<\/h3>\n<p>\u8fd9\u7bc7\u6587\u7ae0\u4ece DQN \u7684\u8fc7\u4f30\u8ba1\u95ee\u9898\u8bb2\u8d77&#xff0c;\u6cbf\u7740 Double DQN\u3001Dueling \u67b6\u6784\u3001\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u7b49\u7b49\u4ecb\u7ecd\u4e0b\u6765&#xff0c;\u6bcf\u79cd\u6539\u8fdb\u5bf9\u5e94\u4e00\u4e2a\u5177\u4f53\u7684\u5931\u8d25\u6a21\u5f0f&#xff1a;max \u7b97\u5b50\u7684\u504f\u5dee\u3001\u4f4e\u6548\u7684\u72b6\u6001-\u52a8\u4f5c\u8868\u793a\u3001\u6d6a\u8d39\u7684\u5747\u5300\u91c7\u6837\u3002<\/p>\n<p>\u4ece\u5934\u5b9e\u73b0\u8fd9\u4e9b\u65b9\u6cd5&#xff0c;\u80fd\u641e\u6e05\u695a\u5b83\u4eec\u4e3a\u4ec0\u4e48\u6709\u6548&#xff1b;\u5f88\u591a&#034;\u9ad8\u7ea7&#034; RL 
\u7b97\u6cd5\u4e0d\u8fc7\u662f\u7b80\u5355\u60f3\u6cd5\u7684\u7ec4\u5408&#xff0c;\u7406\u89e3\u8fd9\u4e9b\u60f3\u6cd5\u672c\u8eab\u624d\u662f\u771f\u6b63\u53ef\u6269\u5c55\u7684\u4e1c\u897f\u3002<\/p>\n<p>https:\/\/avoid.overfit.cn\/post\/4c5835f419d840b0acb0a1eb72f92b6f<\/p>\n<p>\u4f5c\u8005&#xff1a; Jugal Gajjar<\/p>\n","protected":false},"excerpt":{"rendered":"<p>DQN \u7528<br \/>\nmax Q(s&#039;,a&#039;)\u8ba1\u7b97\u76ee\u6807\u503c&#xff0c;\u7b49\u4e8e\u5728\u6311 Q \u503c\u6700\u9ad8\u7684\u52a8\u4f5c&#xff0c;\u4f46\u662f\u8fd9\u4e9b\u52a8\u4f5c\u4e2d\u5305\u62ec\u4e86\u90a3\u4e9b\u56e0\u4e3a\u4f30\u8ba1\u566a\u58f0\u800c\u88ab\u9ad8\u4f30\u7684\u52a8\u4f5c&#xff0c;\u6240\u4ee5\u5c31\u4f1a\u4ea7\u751f\u8fc7\u4f30\u8ba1\u504f\u5dee&#xff0c;\u76f4\u63a5\u540e\u679c\u662f\u8bad\u7ec3\u4e0d\u7a33\u5b9a\u3001\u7b56\u7565\u6b21\u4f18\u3002<br \/>\n\u8fd9\u7bc7\u6587\u7ae0\u8981\u89e3\u51b3\u7684\u5c31\u662f\u8fd9\u4e2a\u95ee\u9898&#xff0c;\u5185\u5bb9\u5305\u62ec&#xff1a;DQN \u4e3a\u4ec0\u4e48\u4f1a\u8fc7\u4f30\u8ba1\u3001Double DQN \u600e\u4e48\u628a\u52a8\u4f5c\u9009\u62e9\u548c\u8bc4\u4f30\u62c6\u5f00\u3001Dueling DQN \u600e\u4e48\u5206\u79bb\u72b6\u6001\u503c\u548c\u52a8\u4f5c\u4f18\u52bf\u3001\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u5982\u4f55\u8ba9\u91c7\u6837\u66f4\u806a\u660e&#xff0c;\u4ee5\u53ca<\/p>\n","protected":false},"author":2,"featured_media":70378,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[152,50,752,86],"topic":[],"class_list":["post-70380","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-server","tag-pytorch","tag-50","tag-752","tag-86"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/70380.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"DQN \u7528 max Q(s&#039;,a&#039;)\u8ba1\u7b97\u76ee\u6807\u503c&#xff0c;\u7b49\u4e8e\u5728\u6311 Q \u503c\u6700\u9ad8\u7684\u52a8\u4f5c&#xff0c;\u4f46\u662f\u8fd9\u4e9b\u52a8\u4f5c\u4e2d\u5305\u62ec\u4e86\u90a3\u4e9b\u56e0\u4e3a\u4f30\u8ba1\u566a\u58f0\u800c\u88ab\u9ad8\u4f30\u7684\u52a8\u4f5c&#xff0c;\u6240\u4ee5\u5c31\u4f1a\u4ea7\u751f\u8fc7\u4f30\u8ba1\u504f\u5dee&#xff0c;\u76f4\u63a5\u540e\u679c\u662f\u8bad\u7ec3\u4e0d\u7a33\u5b9a\u3001\u7b56\u7565\u6b21\u4f18\u3002 \u8fd9\u7bc7\u6587\u7ae0\u8981\u89e3\u51b3\u7684\u5c31\u662f\u8fd9\u4e2a\u95ee\u9898&#xff0c;\u5185\u5bb9\u5305\u62ec&#xff1a;DQN \u4e3a\u4ec0\u4e48\u4f1a\u8fc7\u4f30\u8ba1\u3001Double DQN \u600e\u4e48\u628a\u52a8\u4f5c\u9009\u62e9\u548c\u8bc4\u4f30\u62c6\u5f00\u3001Dueling DQN \u600e\u4e48\u5206\u79bb\u72b6\u6001\u503c\u548c\u52a8\u4f5c\u4f18\u52bf\u3001\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u5982\u4f55\u8ba9\u91c7\u6837\u66f4\u806a\u660e&#xff0c;\u4ee5\u53ca\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/70380.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-02-01T14:05:59+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2026\/02\/20260201140558-697f5dc698df7.png\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta 
name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"10 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/70380.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/70380.html\",\"name\":\"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-02-01T14:05:59+00:00\",\"dateModified\":\"2026-02-01T14:05:59+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/70380.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/70380.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/70380.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN 
\u7684\u6539\u8fdb\u65b9\u6848\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/70380.html","og_locale":"zh_CN","og_type":"article","og_title":"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"DQN \u7528 max Q(s&#039;,a&#039;)\u8ba1\u7b97\u76ee\u6807\u503c&#xff0c;\u7b49\u4e8e\u5728\u6311 Q \u503c\u6700\u9ad8\u7684\u52a8\u4f5c&#xff0c;\u4f46\u662f\u8fd9\u4e9b\u52a8\u4f5c\u4e2d\u5305\u62ec\u4e86\u90a3\u4e9b\u56e0\u4e3a\u4f30\u8ba1\u566a\u58f0\u800c\u88ab\u9ad8\u4f30\u7684\u52a8\u4f5c&#xff0c;\u6240\u4ee5\u5c31\u4f1a\u4ea7\u751f\u8fc7\u4f30\u8ba1\u504f\u5dee&#xff0c;\u76f4\u63a5\u540e\u679c\u662f\u8bad\u7ec3\u4e0d\u7a33\u5b9a\u3001\u7b56\u7565\u6b21\u4f18\u3002 \u8fd9\u7bc7\u6587\u7ae0\u8981\u89e3\u51b3\u7684\u5c31\u662f\u8fd9\u4e2a\u95ee\u9898&#xff0c;\u5185\u5bb9\u5305\u62ec&#xff1a;DQN \u4e3a\u4ec0\u4e48\u4f1a\u8fc7\u4f30\u8ba1\u3001Double DQN \u600e\u4e48\u628a\u52a8\u4f5c\u9009\u62e9\u548c\u8bc4\u4f30\u62c6\u5f00\u3001Dueling DQN \u600e\u4e48\u5206\u79bb\u72b6\u6001\u503c\u548c\u52a8\u4f5c\u4f18\u52bf\u3001\u4f18\u5148\u7ecf\u9a8c\u56de\u653e\u5982\u4f55\u8ba9\u91c7\u6837\u66f4\u806a\u660e&#xff0c;\u4ee5\u53ca","og_url":"https:\/\/www.wsisp.com\/helps\/70380.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-02-01T14:05:59+00:00","og_image":[{"url":"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2026\/02\/20260201140558-697f5dc698df7.png"}],"author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"10 
\u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/70380.html","url":"https:\/\/www.wsisp.com\/helps\/70380.html","name":"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-02-01T14:05:59+00:00","dateModified":"2026-02-01T14:05:59+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/70380.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/70380.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/70380.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u8ba9 Q \u503c\u4f30\u8ba1\u66f4\u51c6\u786e\uff1a\u4ece DQN \u5230 Double DQN \u7684\u6539\u8fdb\u65b9\u6848"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/70380","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=70380"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/70380\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media\/70378"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=70380"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=70380"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=70380"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=70380"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}