{"id":75296,"date":"2026-02-11T23:57:43","date_gmt":"2026-02-11T15:57:43","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/75296.html"},"modified":"2026-02-11T23:57:43","modified_gmt":"2026-02-11T15:57:43","slug":"%e6%b7%b1%e5%ba%a6%e5%bc%ba%e5%8c%96%e5%ad%a6%e4%b9%a0%e5%85%a8%e6%a0%88%e6%8c%87%e5%8d%97%ef%bc%9a%e4%bb%8e%e7%90%86%e8%ae%ba%e5%8e%9f%e7%90%86%e5%88%b0%e6%9c%ba%e5%99%a8%e4%ba%ba%e5%ae%9e%e6%88%98","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/75296.html","title":{"rendered":"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218"},"content":{"rendered":"<h2>\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357&#xff1a;\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218<\/h2>\n<p>\u6458\u8981&#xff1a;\u672c\u6587\u7cfb\u7edf\u68b3\u7406\u5f3a\u5316\u5b66\u4e60&#xff08;Reinforcement Learning, RL&#xff09;\u7684\u7406\u8bba\u6846\u67b6\u4e0e\u6280\u672f\u4f53\u7cfb&#xff0c;\u6df1\u5165\u5256\u6790\u5176\u4e0e\u76d1\u7763\u5b66\u4e60\u3001\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u672c\u8d28\u5dee\u5f02&#xff0c;\u91cd\u70b9\u8bb2\u89e3\u57fa\u4e8e\u4eba\u7c7b\u53cd\u9988\u7684\u5f3a\u5316\u5b66\u4e60&#xff08;RLHF&#xff09;\u5728GPT\u7b49\u5927\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u5e94\u7528&#xff0c;\u5e76\u901a\u8fc7\u5177\u8eab\u667a\u80fd&#xff08;Embodied AI&#xff09;\u4e0e\u673a\u5668\u4eba\u63a7\u5236\u5b9e\u6218\u6848\u4f8b&#xff0c;\u5c55\u793aRL\u5728\u771f\u5b9e\u573a\u666f\u4e2d\u7684\u843d\u5730\u8def\u5f84\u3002\u6700\u540e\u63d0\u4f9b\u5b8c\u6574\u7684\u4ee3\u7801\u5b9e\u73b0\u6846\u67b6\u4e0e\u5f00\u6e90\u8d44\u6e90\u5bfc\u822a&#xff0c;\u52a9\u529b\u5f00\u53d1\u8005\u5feb\u901f\u5165\u95e8\u5f3a\u5316\u5b66\u4e60\u5de5\u7a0b\u5b9e\u8df5\u3002<\/p>\n<p>\u5173\u952e\u8bcd&#xff1a;\u5f3a\u5316\u5b66\u4e60&#xff1b;RLHF&#xff1b;\u5177\u8eab\u667a\u80fd&#xff1b;\u673a\u5668\u4eba\u63a7\u5236&#xff1b;Stable Baselines&#xff1b;Gym<\/p>\n<hr 
\/>\n<h3>\u4e00\u3001\u5f3a\u5316\u5b66\u4e60\u57fa\u672c\u6982\u5ff5\u4e0e\u6838\u5fc3\u6846\u67b6<\/h3>\n<h4>1.1 \u5b9a\u4e49&#xff1a;\u667a\u80fd\u4f53\u7684\u5e8f\u8d2f\u51b3\u7b56\u8fc7\u7a0b<\/h4>\n<p>\u5f3a\u5316\u5b66\u4e60&#xff08;Reinforcement Learning, RL&#xff09; \u662f\u673a\u5668\u5b66\u4e60\u7684\u91cd\u8981\u8303\u5f0f&#xff0c;\u5176\u6838\u5fc3\u5728\u4e8e\u8bad\u7ec3\u667a\u80fd\u4f53&#xff08;Agent&#xff09; \u5728\u590d\u6742\u4e0d\u786e\u5b9a\u73af\u5883&#xff08;Environment&#xff09; \u4e2d&#xff0c;\u901a\u8fc7\u8bd5\u9519\u5b66\u4e60&#xff08;Trial-and-Error&#xff09; \u6700\u5927\u5316\u957f\u671f\u7d2f\u79ef\u5956\u52b1&#xff08;Cumulative Reward&#xff09;\u3002<\/p>\n<p>\u4e0e\u76d1\u7763\u5b66\u4e60\u7684&#034;\u5f00\u5377\u8003\u8bd5&#034;\u6a21\u5f0f\u4e0d\u540c&#xff0c;\u5f3a\u5316\u5b66\u4e60\u66f4\u63a5\u8fd1\u63a2\u7d22-\u5229\u7528\u56f0\u5883&#xff08;Exploration-Exploitation Dilemma&#xff09;&#xff1a;\u667a\u80fd\u4f53\u9700\u8981\u5728\u5c1d\u8bd5\u65b0\u7b56\u7565&#xff08;\u63a2\u7d22&#xff09;\u4e0e\u6267\u884c\u5df2\u77e5\u6700\u4f18\u7b56\u7565&#xff08;\u5229\u7528&#xff09;\u4e4b\u95f4\u53d6\u5f97\u5e73\u8861&#xff0c;\u4ee5\u83b7\u53d6\u6700\u5927\u957f\u671f\u6536\u76ca\u3002<\/p>\n<p>\u6570\u5b66\u5f62\u5f0f\u5316&#xff08;\u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b&#xff0c;MDP&#xff09;&#xff1a;<\/p>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         M<\/p>\n<p>         &#061;<\/p>\n<p>         (<\/p>\n<p>         S<\/p>\n<p>         ,<\/p>\n<p>         A<\/p>\n<p>         ,<\/p>\n<p>         P<\/p>\n<p>         ,<\/p>\n<p>         R<\/p>\n<p>         ,<\/p>\n<p>         \u03b3<\/p>\n<p>         )<\/p>\n<p>        \\\\mathcal{M} &#061; (\\\\mathcal{S}, \\\\mathcal{A}, \\\\mathcal{P}, \\\\mathcal{R}, \\\\gamma)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord 
mathcal\">M<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathcal\" style=\"margin-right: 0.075em\">S<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathcal\">A<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathcal\" style=\"margin-right: 0.0822em\">P<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathcal\">R<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0556em\">\u03b3<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<p>\u5176\u4e2d&#xff1a;<\/p>\n<ul>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         S<\/p>\n<p>        \\\\mathcal{S}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathcal\" style=\"margin-right: 0.075em\">S<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u72b6\u6001\u7a7a\u95f4&#xff08;State Space&#xff09;<\/li>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         A<\/p>\n<p>        \\\\mathcal{A}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathcal\">A<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u52a8\u4f5c\u7a7a\u95f4&#xff08;Action Space&#xff09;<\/li>\n<li><span class=\"katex--inline\"><span 
class=\"katex\"><span class=\"katex-mathml\">\n<p>         P<\/p>\n<p>         (<\/p>\n<p>          s<\/p>\n<p>          \u2032<\/p>\n<p>         \u2223<\/p>\n<p>         s<\/p>\n<p>         ,<\/p>\n<p>         a<\/p>\n<p>         )<\/p>\n<p>        \\\\mathcal{P}(s&#039;|s,a)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.0019em;vertical-align: -0.25em\"><\/span><span class=\"mord mathcal\" style=\"margin-right: 0.0822em\">P<\/span><span class=\"mopen\">(<\/span><span class=\"mord\"><span class=\"mord mathnormal\">s<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.7519em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">\u2032<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mord\">\u2223<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">a<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u72b6\u6001\u8f6c\u79fb\u6982\u7387<\/li>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         R<\/p>\n<p>         (<\/p>\n<p>         s<\/p>\n<p>         ,<\/p>\n<p>         a<\/p>\n<p>         )<\/p>\n<p>        \\\\mathcal{R}(s,a)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathcal\">R<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">a<\/span><span 
class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u5373\u65f6\u5956\u52b1\u51fd\u6570<\/li>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         \u03b3<\/p>\n<p>         \u2208<\/p>\n<p>         [<\/p>\n<p>         0<\/p>\n<p>         ,<\/p>\n<p>         1<\/p>\n<p>         ]<\/p>\n<p>        \\\\gamma \\\\in [0,1]<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7335em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0556em\">\u03b3<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2208<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">[<\/span><span class=\"mord\">0<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">1<\/span><span class=\"mclose\">]<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u6298\u6263\u56e0\u5b50&#xff08;\u5e73\u8861\u5373\u65f6\u4e0e\u957f\u671f\u5956\u52b1&#xff09;<\/li>\n<\/ul>\n<h4>1.2 \u5173\u952e\u7ec4\u4ef6\u89e3\u6790<\/h4>\n<table>\n<thead>\n<tr>\n<th>\u7ec4\u4ef6<\/th>\n<th>\u529f\u80fd\u63cf\u8ff0<\/th>\n<th>\u6280\u672f\u5b9e\u73b0\u8981\u70b9<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u7b56\u7565&#xff08;Policy&#xff09;<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           \u03c0<\/p>\n<p>           (<\/p>\n<p>           a<\/p>\n<p>           \u2223<\/p>\n<p>           s<\/p>\n<p>           )<\/p>\n<p>          \\\\pi(a|s)<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c0<\/span><span class=\"mopen\">(<\/span><span class=\"mord 
mathnormal\">a<\/span><span class=\"mord\">\u2223<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u72b6\u6001\u5230\u52a8\u4f5c\u7684\u6620\u5c04<\/td>\n<td>\u968f\u673a\u7b56\u7565&#xff08;Softmax&#xff09;vs \u786e\u5b9a\u6027\u7b56\u7565&#xff08;Argmax&#xff09;<\/td>\n<\/tr>\n<tr>\n<td>\u4ef7\u503c\u51fd\u6570&#xff08;Value Function&#xff09;<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>            V<\/p>\n<p>            \u03c0<\/p>\n<p>           (<\/p>\n<p>           s<\/p>\n<p>           )<\/p>\n<p>          V^\\\\pi(s)<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.6644em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0359em\">\u03c0<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u72b6\u6001\u957f\u671f\u4ef7\u503c\u8bc4\u4f30<\/td>\n<td>\u8499\u7279\u5361\u6d1b\u4f30\u8ba1 vs \u65f6\u5e8f\u5dee\u5206&#xff08;TD&#xff09;\u5b66\u4e60<\/td>\n<\/tr>\n<tr>\n<td>Q\u51fd\u6570&#xff08;Action-Value&#xff09;<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>            Q<\/p>\n<p>            \u03c0<\/p>\n<p>           (<\/p>\n<p>           s<\/p>\n<p>           ,<\/p>\n<p>           a<\/p>\n<p>           )<\/p>\n<p>          Q^\\\\pi(s,a)<\/p>\n<p>       
<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">Q<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.6644em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0359em\">\u03c0<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">a<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u72b6\u6001-\u52a8\u4f5c\u5bf9\u4ef7\u503c<\/td>\n<td>DQN\u3001Double DQN\u3001Dueling DQN\u67b6\u6784<\/td>\n<\/tr>\n<tr>\n<td>\u6a21\u578b&#xff08;Model&#xff09;<\/td>\n<td>\u73af\u5883\u52a8\u6001\u7684\u5b66\u4e60\u8868\u793a<\/td>\n<td>Model-Based RL&#xff08;\u5982MuZero&#xff09;vs Model-Free RL<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<hr \/>\n<h3>\u4e8c\u3001\u4e09\u5927\u5b66\u4e60\u8303\u5f0f\u5bf9\u6bd4&#xff1a;\u5f3a\u5316\u5b66\u4e60\u7684\u72ec\u7279\u5b9a\u4f4d<\/h3>\n<p>\u7406\u89e3\u5f3a\u5316\u5b66\u4e60\u5728\u673a\u5668\u5b66\u4e60\u7248\u56fe\u4e2d\u7684\u4f4d\u7f6e&#xff0c;\u662f\u638c\u63e1\u5176\u5e94\u7528\u573a\u666f\u7684\u524d\u63d0\u3002<\/p>\n<h4>2.1 \u76d1\u7763\u5b66\u4e60&#xff08;Supervised Learning&#xff09;<\/h4>\n<p>\u6838\u5fc3\u7279\u5f81&#xff1a;\u6709\u6807\u7b7e\u53cd\u9988&#xff0c;\u9884\u6d4b\u672a\u6765<\/p>\n<ul>\n<li>\u6570\u636e\u6d41&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         (<\/p>\n<p>          x<\/p>\n<p>          i<\/p>\n<p>         ,<\/p>\n<p>          y<\/p>\n<p>          i<\/p>\n<p> 
        )<\/p>\n<p>        (x_i, y_i)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord\"><span class=\"mord mathnormal\">x<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3117em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">i<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3117em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">i<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span> \u8f93\u5165-\u8f93\u51fa\u5bf9<\/li>\n<li>\u76ee\u6807&#xff1a;\u5b66\u4e60\u6620\u5c04\u51fd\u6570 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         f<\/p>\n<p>         :<\/p>\n<p>         X<\/p>\n<p>         \u2192<\/p>\n<p>         Y<\/p>\n<p>        f: \\\\mathcal{X} \\\\to 
\\\\mathcal{Y}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8889em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.1076em\">f<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">:<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathcal\" style=\"margin-right: 0.1464em\">X<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2192<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7805em;vertical-align: -0.0972em\"><\/span><span class=\"mord mathcal\" style=\"margin-right: 0.0822em\">Y<\/span><\/span><\/span><\/span><\/span> \u6700\u5c0f\u5316\u9884\u6d4b\u8bef\u5dee<\/li>\n<li>\u5178\u578b\u4efb\u52a1&#xff1a;\u56fe\u50cf\u5206\u7c7b&#xff08;\u5982\u732b\u72d7\u8bc6\u522b&#xff09;\u3001\u60c5\u611f\u5206\u6790\u3001\u8bed\u97f3\u8bc6\u522b<\/li>\n<li>\u5c40\u9650\u6027&#xff1a;\u4f9d\u8d56\u5927\u91cf\u6807\u6ce8\u6570\u636e&#xff0c;\u96be\u4ee5\u5904\u7406\u52a8\u6001\u4ea4\u4e92\u573a\u666f<\/li>\n<\/ul>\n<p>\u7c7b\u6bd4&#xff1a;\u5982\u540c\u5b66\u751f\u901a\u8fc7\u6807\u51c6\u7b54\u6848&#xff08;\u6807\u7b7e&#xff09;\u5b66\u4e60&#xff0c;\u9002\u5408\u6709\u660e\u786e&#034;\u6b63\u786e\u7b54\u6848&#034;\u7684\u4efb\u52a1\u3002<\/p>\n<h4>2.2 \u65e0\u76d1\u7763\u5b66\u4e60&#xff08;Unsupervised Learning&#xff09;<\/h4>\n<p>\u6838\u5fc3\u7279\u5f81&#xff1a;\u6316\u6398\u6570\u636e\u4e2d\u7684\u9690\u85cf\u7ed3\u6784&#xff0c;\u65e0\u76f4\u63a5\u53cd\u9988<\/p>\n<ul>\n<li>\u6570\u636e\u6d41&#xff1a;\u4ec5\u542b\u8f93\u5165 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         {<\/p>\n<p>          x<\/p>\n<p>          i<\/p>\n<p>         
}<\/p>\n<p>        \\\\{x_i\\\\}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">{<\/span><span class=\"mord\"><span class=\"mord mathnormal\">x<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3117em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">i<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">}<\/span><\/span><\/span><\/span><\/span>&#xff0c;\u65e0\u6807\u7b7e<\/li>\n<li>\u76ee\u6807&#xff1a;\u53d1\u73b0\u6570\u636e\u5185\u5728\u5206\u5e03\u89c4\u5f8b&#xff08;\u805a\u7c7b\u3001\u964d\u7ef4\u3001\u5bc6\u5ea6\u4f30\u8ba1&#xff09;<\/li>\n<li>\u5178\u578b\u4efb\u52a1&#xff1a;\u5ba2\u6237\u5206\u7fa4\u3001\u5f02\u5e38\u68c0\u6d4b\u3001\u7279\u5f81\u5b66\u4e60<\/li>\n<li>\u5c40\u9650\u6027&#xff1a;\u7f3a\u4e4f\u660e\u786e\u4f18\u5316\u76ee\u6807&#xff0c;\u8bc4\u4f30\u56f0\u96be<\/li>\n<\/ul>\n<p>\u7c7b\u6bd4&#xff1a;\u5982\u540c\u5b66\u751f\u81ea\u4e3b\u6574\u7406\u7b14\u8bb0&#xff0c;\u53d1\u73b0\u77e5\u8bc6\u70b9\u95f4\u7684\u5173\u8054&#xff0c;\u4f46\u65e0\u660e\u786e\u5bf9\u9519\u6807\u51c6\u3002<\/p>\n<h4>2.3 \u5f3a\u5316\u5b66\u4e60&#xff08;Reinforcement Learning&#xff09;<\/h4>\n<p>\u6838\u5fc3\u7279\u5f81&#xff1a;\u57fa\u4e8e\u5956\u52b1\u673a\u5236&#xff0c;\u91c7\u53d6\u884c\u52a8\u5e8f\u5217\u4ee5\u4f18\u5316\u957f\u671f\u7ed3\u679c<\/p>\n<ul>\n<li>\u6570\u636e\u6d41&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         (<\/p>\n<p>          s<\/p>\n<p>          t<\/p>\n<p>   
      ,<\/p>\n<p>          a<\/p>\n<p>          t<\/p>\n<p>         ,<\/p>\n<p>          r<\/p>\n<p>          t<\/p>\n<p>         ,<\/p>\n<p>          s<\/p>\n<p>           t<\/p>\n<p>           &#043;<\/p>\n<p>           1<\/p>\n<p>         )<\/p>\n<p>        (s_t, a_t, r_t, s_{t&#043;1})<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord\"><span class=\"mord mathnormal\">s<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">a<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 
0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">s<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">t<\/span><span class=\"mbin mtight\">&#043;<\/span><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2083em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span> \u72b6\u6001-\u52a8\u4f5c-\u5956\u52b1-\u4e0b\u4e00\u72b6\u6001\u5e8f\u5217<\/li>\n<li>\u76ee\u6807&#xff1a;\u6700\u5927\u5316\u7d2f\u79ef\u5956\u52b1 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         E<\/p>\n<p>         [<\/p>\n<p>          \u2211<\/p>\n<p>           t<\/p>\n<p>           &#061;<\/p>\n<p>           0<\/p>\n<p>          \u221e<\/p>\n<p>          \u03b3<\/p>\n<p>          t<\/p>\n<p>          r<\/p>\n<p>          t<\/p>\n<p>         
]<\/p>\n<p>        \\\\mathbb{E}[\\\\sum_{t&#061;0}^{\\\\infty} \\\\gamma^t r_t]<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.104em;vertical-align: -0.2997em\"><\/span><span class=\"mord mathbb\">E<\/span><span class=\"mopen\">[<\/span><span class=\"mop\"><span class=\"mop op-symbol small-op\" style=\"position: relative;top: 0em\">\u2211<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8043em\"><span class=\"\" style=\"top: -2.4003em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">t<\/span><span class=\"mrel mtight\">&#061;<\/span><span class=\"mord mtight\">0<\/span><\/span><\/span><\/span><span class=\"\" style=\"top: -3.2029em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">\u221e<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2997em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0556em\">\u03b3<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.7936em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 
0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">]<\/span><\/span><\/span><\/span><\/span><\/li>\n<li>\u5173\u952e\u5dee\u5f02&#xff1a;\n<ul>\n<li>\u65f6\u5e8f\u4f9d\u8d56\u6027&#xff1a;\u5f53\u524d\u52a8\u4f5c\u5f71\u54cd\u672a\u6765\u72b6\u6001&#xff08;\u975e\u72ec\u7acb\u540c\u5206\u5e03&#xff09;<\/li>\n<li>\u5ef6\u8fdf\u53cd\u9988&#xff1a;\u5956\u52b1\u53ef\u80fd\u6ede\u540e\u591a\u4e2a\u65f6\u95f4\u6b65&#xff08;Credit Assignment\u95ee\u9898&#xff09;<\/li>\n<li>\u4e3b\u52a8\u63a2\u7d22&#xff1a;\u667a\u80fd\u4f53\u901a\u8fc7\u884c\u4e3a\u5f71\u54cd\u6570\u636e\u5206\u5e03<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>\u7c7b\u6bd4&#xff1a;\u5982\u540c\u5a74\u513f\u5b66\u8d70\u8def&#xff0c;\u901a\u8fc7\u8dcc\u5012&#xff08;\u8d1f\u5956\u52b1&#xff09;\u548c\u7ad9\u7a33&#xff08;\u6b63\u5956\u52b1&#xff09;\u7684\u53cd\u9988&#xff0c;\u9010\u6b65\u4f18\u5316\u52a8\u4f5c\u7b56\u7565\u3002<\/p>\n<h4>2.4 \u8303\u5f0f\u5bf9\u6bd4\u603b\u7ed3<\/h4>\n<p>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502     \u7ef4\u5ea6        \u2502   
\u76d1\u7763\u5b66\u4e60       \u2502   \u65e0\u76d1\u7763\u5b66\u4e60     \u2502   \u5f3a\u5316\u5b66\u4e60       \u2502<br \/>\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524<br \/>\n\u2502 \u6570\u636e\u5f62\u5f0f        \u2502 (\u8f93\u5165, \u6807\u7b7e)     \u2502 \u4ec5\u8f93\u5165\u6570\u636e       \u2502 (\u72b6\u6001, \u52a8\u4f5c, \u5956\u52b1)\u5e8f\u5217 \u2502<br \/>\n\u2502 \u53cd\u9988\u7c7b\u578b        \u2502 \u5373\u65f6\u3001\u660e\u786e       \u2502 \u65e0\u76f4\u63a5\u53cd\u9988       \u2502 \u5ef6\u8fdf\u3001\u7a00\u758f       \u2502<br \/>\n\u2502 \u51b3\u7b56\u6027\u8d28        \u2502 \u5355\u6b65\u9884\u6d4b         \u2502 \u6a21\u5f0f\u53d1\u73b0         \u2502 \u5e8f\u8d2f\u51b3\u7b56         \u2502<br \/>\n\u2502 \u73af\u5883\u4ea4\u4e92        \u2502 \u65e0               \u2502 \u65e0               \u2502 \u4e3b\u52a8\u3001\u52a8\u6001       \u2502<br \/>\n\u2502 \u5178\u578b\u5e94\u7528        \u2502 \u56fe\u50cf\u5206\u7c7b         \u2502 \u805a\u7c7b\u5206\u6790         \u2502 \u6e38\u620fAI\u3001\u673a\u5668\u4eba   \u2502<br 
\/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<h3>\u4e09\u3001\u57fa\u4e8e\u4eba\u7c7b\u53cd\u9988\u7684\u5f3a\u5316\u5b66\u4e60&#xff08;RLHF&#xff09;&#xff1a;\u5927\u6a21\u578b\u5bf9\u9f50\u7684\u5173\u952e\u6280\u672f<\/h3>\n<h4>3.1 RLHF\u6280\u672f\u80cc\u666f\u4e0e\u6d41\u7a0b<\/h4>\n<p>RLHF&#xff08;Reinforcement Learning from Human Feedback&#xff09; \u662f\u5c06\u5f3a\u5316\u5b66\u4e60\u5e94\u7528\u4e8e\u5927\u8bed\u8a00\u6a21\u578b&#xff08;LLM&#xff09;\u5bf9\u9f50\u7684\u6838\u5fc3\u6280\u672f&#xff0c;\u88ab\u5e7f\u6cdb\u5e94\u7528\u4e8eGPT\u7cfb\u5217\u3001ChatGPT\u3001Claude\u7b49\u6a21\u578b\u7684\u8bad\u7ec3\u540e\u671f\u9636\u6bb5\u3002<\/p>\n<p>\u6807\u51c6\u4e09\u9636\u6bb5\u6d41\u7a0b&#xff1a;<\/p>\n<h5>\u9636\u6bb5\u4e00&#xff1a;\u76d1\u7763\u5fae\u8c03&#xff08;SFT, Supervised Fine-Tuning&#xff09;<\/h5>\n<ul>\n<li>\u4f7f\u7528\u9ad8\u8d28\u91cf\u4eba\u5de5\u6807\u6ce8\u6570\u636e\u5bf9\u9884\u8bad\u7ec3\u6a21\u578b\u8fdb\u884c\u521d\u6b65\u884c\u4e3a\u5851\u9020<\/li>\n<li>\u8ba9\u6a21\u578b\u5b66\u4e60\u57fa\u672c\u7684\u6307\u4ee4\u9075\u5faa\u80fd\u529b\u548c\u5bf9\u8bdd\u683c\u5f0f<\/li>\n<\/ul>\n<h5>\u9636\u6bb5\u4e8c&#xff1a;\u5956\u52b1\u6a21\u578b\u8bad\u7ec3&#xff08;Reward Model Training&#xff09;<\/h5>\n<ul>\n<li>\u6536\u96c6\u4eba\u7c7b\u504f\u597d\u6570\u636e&#xff1a;\u5bf9\u540c\u4e00\u63d0\u793a\u7684\u591a\u4e2a\u8f93\u51fa\u8fdb\u884c\u6210\u5bf9\u6bd4\u8f83&#xff08;Pairwise Comparison&#xff09;<\/li>\n<li>\u8bad\u7ec3\u5956\u52b1\u6a21\u578b <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          
r_\\theta(x,y)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span> \u9884\u6d4b\u4eba\u7c7b\u504f\u597d\u5206\u6570<\/li>\n<li>\u635f\u5931\u51fd\u6570&#xff08;Bradley-Terry\u6a21\u578b&#xff09;&#xff1a;<\/li>\n<\/ul>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span 
class=\"katex-mathml\">\\mathcal{L}_{RM} &#061; -\\mathbb{E}_{(x,y_w,y_l)\\sim \\mathcal{D}} \\left[ \\log \\sigma \\left( r_\\theta(x,y_w) - r_\\theta(x,y_l) \\right) \\right]<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathcal\">L<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3283em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.109em\">RM<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.1052em;vertical-align: -0.3552em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\"><span class=\"mord mathbb\">E<\/span><span class=\"msupsub\"><span class=\"vlist-t 
vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3448em\"><span class=\"\" style=\"top: -2.5198em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mopen mtight\">(<\/span><span class=\"mord mathnormal mtight\">x<\/span><span class=\"mpunct mtight\">,<\/span><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1645em\"><span class=\"\" style=\"top: -2.357em;margin-left: -0.0359em;margin-right: 0.0714em\"><span class=\"pstrut\" style=\"height: 2.5em\"><\/span><span class=\"sizing reset-size3 size1 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0269em\">w<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.143em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct mtight\">,<\/span><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3448em\"><span class=\"\" style=\"top: -2.3488em;margin-left: -0.0359em;margin-right: 0.0714em\"><span class=\"pstrut\" style=\"height: 2.5em\"><\/span><span class=\"sizing reset-size3 size1 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1512em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose mtight\">)<\/span><span class=\"mrel mtight\">\u223c<\/span><span class=\"mord 
mathcal mtight\" style=\"margin-right: 0.0278em\">D<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3552em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"minner\"><span class=\"mopen delimcenter\" style=\"top: 0em\">[<\/span><span class=\"mop\">lo<span style=\"margin-right: 0.0139em\">g<\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c3<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"minner\"><span class=\"mopen delimcenter\" style=\"top: 0em\">(<\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1514em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" 
style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0269em\">w<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u2212<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><\/span><\/span><\/span><span 
class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><span class=\"mclose delimcenter\" style=\"top: 0em\">)<\/span><\/span><span class=\"mclose delimcenter\" style=\"top: 0em\">]<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<p>\u5176\u4e2d <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">y_w<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.625em;vertical-align: -0.1944em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1514em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0269em\">w<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u4e3a\u4eba\u7c7b\u504f\u597d\u7684&#034;\u80dc&#034;\u8f93\u51fa&#xff0c;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">y_l<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.625em;vertical-align: -0.1944em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span 
class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u4e3a&#034;\u8d1f&#034;\u8f93\u51fa\u3002<\/p>\n<h5>\u9636\u6bb5\u4e09&#xff1a;\u5f3a\u5316\u5b66\u4e60\u4f18\u5316&#xff08;PPO\u4f18\u5316&#xff09;<\/h5>\n<ul>\n<li>\u4f7f\u7528\u8fd1\u7aef\u7b56\u7565\u4f18\u5316&#xff08;Proximal Policy Optimization, PPO&#xff09; \u7b97\u6cd5\u5fae\u8c03\u7b56\u7565<\/li>\n<li>\u76ee\u6807\u51fd\u6570\u5305\u542b\u5956\u52b1\u6700\u5927\u5316\u4e0eKL\u6563\u5ea6\u7ea6\u675f&#xff08;\u9632\u6b62\u6a21\u578b\u504f\u79bb\u592a\u8fdc&#xff09;&#xff1a;<\/li>\n<\/ul>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span 
class=\"katex-mathml\">\\mathcal{L}_{PPO} &#061; \\mathbb{E} \\left[ \\min \\left( \\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{old}}(a|s)} A_t, \\text{clip}(\\cdot, 1-\\epsilon, 1&#043;\\epsilon) A_t \\right) \\right]<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathcal\">L<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3283em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">PPO<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 2.4em;vertical-align: -0.95em\"><\/span><span class=\"mord mathbb\">E<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"minner\"><span class=\"mopen delimcenter\" style=\"top: 0em\"><span class=\"delimsizing size3\">[<\/span><\/span><span class=\"mop\">min<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"minner\"><span 
class=\"mopen delimcenter\" style=\"top: 0em\"><span class=\"delimsizing size3\">(<\/span><\/span><span class=\"mord\"><span class=\"mopen nulldelimiter\"><\/span><span class=\"mfrac\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 1.427em\"><span class=\"\" style=\"top: -2.314em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord\"><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c0<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">\u03b8<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3448em\"><span class=\"\" style=\"top: -2.3488em;margin-left: -0.0278em;margin-right: 0.0714em\"><span class=\"pstrut\" style=\"height: 2.5em\"><\/span><span class=\"sizing reset-size3 size1 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">o<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><span class=\"mord mathnormal mtight\">d<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1512em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2559em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">a<\/span><span 
class=\"mord\">\u2223<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mclose\">)<\/span><\/span><\/span><span class=\"\" style=\"top: -3.23em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"frac-line\" style=\"border-bottom-width: 0.04em\"><\/span><\/span><span class=\"\" style=\"top: -3.677em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord\"><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c0<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">a<\/span><span class=\"mord\">\u2223<\/span><span class=\"mord mathnormal\">s<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.9419em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><span class=\"mclose nulldelimiter\"><\/span><\/span><span class=\"mord\"><span class=\"mord mathnormal\">A<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span 
class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord text\"><span class=\"mord\">clip<\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord\">\u22c5<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">1<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u2212<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mord mathnormal\">\u03f5<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">1<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mord mathnormal\">\u03f5<\/span><span class=\"mclose\">)<\/span><span class=\"mord\"><span class=\"mord mathnormal\">A<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose delimcenter\" style=\"top: 0em\"><span class=\"delimsizing size3\">)<\/span><\/span><\/span><span class=\"mclose delimcenter\" style=\"top: 0em\"><span class=\"delimsizing 
size3\">]<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<h4>3.2 RLHF\u5728GPT\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u5b9e\u8df5\u610f\u4e49<\/h4>\n<table>\n<thead>\n<tr>\n<th>\u4f18\u5316\u76ee\u6807<\/th>\n<th>\u6280\u672f\u5b9e\u73b0<\/th>\n<th>\u6548\u679c<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u6709\u7528\u6027&#xff08;Helpfulness&#xff09;<\/td>\n<td>\u9ad8\u5956\u52b1\u5206\u914d\u7ed9\u8be6\u7ec6\u3001\u51c6\u786e\u56de\u7b54<\/td>\n<td>\u63d0\u5347\u56de\u7b54\u8d28\u91cf\u4e0e\u4fe1\u606f\u5bc6\u5ea6<\/td>\n<\/tr>\n<tr>\n<td>\u65e0\u5bb3\u6027&#xff08;Harmlessness&#xff09;<\/td>\n<td>\u60e9\u7f5a\u6709\u5bb3\u3001\u504f\u89c1\u3001\u5371\u9669\u5185\u5bb9<\/td>\n<td>\u964d\u4f4e\u6a21\u578b\u8f93\u51fa\u98ce\u9669<\/td>\n<\/tr>\n<tr>\n<td>\u8bda\u5b9e\u6027&#xff08;Honesty&#xff09;<\/td>\n<td>\u5956\u52b1\u627f\u8ba4\u4e0d\u786e\u5b9a\u6027<\/td>\n<td>\u51cf\u5c11\u5e7b\u89c9&#xff08;Hallucination&#xff09;<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<blockquote>\n<p>\u5de5\u7a0b\u6d1e\u5bdf&#xff1a;RLHF\u7684\u6210\u529f\u5173\u952e\u5728\u4e8e\u5956\u52b1\u6a21\u578b\u7684\u8d28\u91cf\u3002\u82e5\u5956\u52b1\u6a21\u578b\u5b58\u5728\u504f\u89c1&#xff0c;\u5f3a\u5316\u5b66\u4e60\u4f1a\u653e\u5927\u8fd9\u79cd\u504f\u89c1&#xff08;Reward Hacking&#xff09;&#xff0c;\u56e0\u6b64\u9700\u8981\u8fed\u4ee3\u4f18\u5316\u5956\u52b1\u6a21\u578b\u4e0e\u7b56\u7565\u7684\u534f\u540c\u8bad\u7ec3\u3002<\/p>\n<\/blockquote>\n<hr \/>\n<h3>\u56db\u3001\u5177\u8eab\u667a\u80fd\u4e0e\u673a\u5668\u4eba\u5e94\u7528&#xff1a;RL\u7684\u7269\u7406\u4e16\u754c\u843d\u5730<\/h3>\n<h4>4.1 \u5177\u8eab\u667a\u80fd&#xff08;Embodied 
AI&#xff09;\u7684\u6280\u672f\u5185\u6db5<\/h4>\n<p>\u5177\u8eab\u667a\u80fd\u5f3a\u8c03\u667a\u80fd\u4f53\u901a\u8fc7\u7269\u7406\u8eab\u4f53\u4e0e\u73af\u5883\u7684\u5b9e\u65f6\u4ea4\u4e92\u83b7\u5f97\u667a\u80fd&#xff0c;\u662f\u5f3a\u5316\u5b66\u4e60\u6700\u5177\u6311\u6218\u4e5f\u6700\u5177\u524d\u666f\u7684\u5e94\u7528\u9886\u57df\u3002\u4e0e\u7eaf\u8f6f\u4ef6AI\u4e0d\u540c&#xff0c;\u5177\u8eab\u667a\u80fd\u9762\u4e34&#xff1a;<\/p>\n<ul>\n<li>\u90e8\u5206\u53ef\u89c2\u6d4b\u6027&#xff08;Partial Observability&#xff09;&#xff1a;\u4f20\u611f\u5668\u566a\u58f0\u4e0e\u89c6\u91ce\u9650\u5236<\/li>\n<li>\u9ad8\u7ef4\u8fde\u7eed\u52a8\u4f5c\u7a7a\u95f4&#xff1a;\u673a\u68b0\u81c2\u5173\u8282\u63a7\u5236\u3001\u5e95\u76d8\u901f\u5ea6\u8c03\u8282<\/li>\n<li>\u6837\u672c\u6548\u7387\u5371\u673a&#xff1a;\u771f\u5b9e\u73af\u5883\u4ea4\u4e92\u6210\u672c\u6781\u9ad8<\/li>\n<\/ul>\n<h4>4.2 \u6838\u5fc3\u5e94\u7528\u573a\u666f\u4e0e\u6280\u672f\u65b9\u6848<\/h4>\n<h5>4.2.1 \u673a\u5668\u4eba\u8fd0\u52a8\u63a7\u5236\u4e0e\u73af\u5883\u611f\u77e5<\/h5>\n<ul>\n<li>\u6311\u6218&#xff1a;\u5e73\u8861\u52a8\u6001\u7a33\u5b9a\u6027\u3001\u80fd\u8017\u4f18\u5316\u4e0e\u4efb\u52a1\u5b8c\u6210\u5ea6<\/li>\n<li>\u65b9\u6848&#xff1a;\n<ul>\n<li>\u5206\u5c42\u5f3a\u5316\u5b66\u4e60&#xff08;Hierarchical RL&#xff09;&#xff1a;\u9ad8\u5c42\u7b56\u7565\u89c4\u5212\u8def\u5f84&#xff0c;\u4f4e\u5c42\u7b56\u7565\u63a7\u5236\u6267\u884c<\/li>\n<li>Sim-to-Real\u8fc1\u79fb&#xff1a;\u5728\u4eff\u771f\u73af\u5883&#xff08;MuJoCo\u3001Isaac Gym&#xff09;\u8bad\u7ec3&#xff0c;\u901a\u8fc7\u57df\u968f\u673a\u5316&#xff08;Domain Randomization&#xff09; \u63d0\u5347\u771f\u5b9e\u573a\u666f\u6cdb\u5316\u6027<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h5>4.2.2 
\u89c6\u89c9\u8bc6\u522b\u4e0e\u7269\u4f53\u6293\u53d6<\/h5>\n<ul>\n<li>\u6311\u6218&#xff1a;\u5149\u7167\u53d8\u5316\u3001\u7269\u4f53\u5f62\u72b6\u591a\u6837\u6027\u3001\u906e\u6321\u5904\u7406<\/li>\n<li>\u65b9\u6848&#xff1a;\n<ul>\n<li>\u89c6\u89c9-\u52a8\u4f5c\u8054\u5408\u5b66\u4e60&#xff1a;CNN\u63d0\u53d6\u89c6\u89c9\u7279\u5f81&#xff0c;\u878d\u5408\u5230\u7b56\u7565\u7f51\u7edc<\/li>\n<li>\u6293\u53d6\u59ff\u6001\u68c0\u6d4b&#xff08;Grasp Pose Detection&#xff09;&#xff1a;\u7ed3\u54086D\u4f4d\u59ff\u4f30\u8ba1\u4e0eRL\u4f18\u5316\u6293\u53d6\u7b56\u7565<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h5>4.2.3 \u865a\u62df\u4eff\u771f\u5e73\u53f0\u8bad\u7ec3<\/h5>\n<p>\u4e3b\u6d41\u4eff\u771f\u5e73\u53f0\u5bf9\u6bd4&#xff1a;<\/p>\n<table>\n<thead>\n<tr>\n<th>\u5e73\u53f0<\/th>\n<th>\u7269\u7406\u5f15\u64ce<\/th>\n<th>\u7279\u70b9<\/th>\n<th>\u9002\u7528\u573a\u666f<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>Gym\/Gymnasium<\/td>\n<td>\u591a\u79cd<\/td>\n<td>OpenAI\u6807\u51c6\u63a5\u53e3&#xff0c;\u751f\u6001\u4e30\u5bcc<\/td>\n<td>\u7b97\u6cd5\u9a8c\u8bc1\u3001\u5165\u95e8\u5b66\u4e60<\/td>\n<\/tr>\n<tr>\n<td>MuJoCo<\/td>\n<td>MuJoCo<\/td>\n<td>\u9ad8\u7cbe\u5ea6\u63a5\u89e6\u7269\u7406&#xff0c;\u8f7b\u91cf\u7ea7<\/td>\n<td>\u7075\u5de7\u64cd\u4f5c\u3001\u6b65\u6001\u63a7\u5236<\/td>\n<\/tr>\n<tr>\n<td>Isaac Gym<\/td>\n<td>PhysX<\/td>\n<td>GPU\u5e76\u884c\u52a0\u901f&#xff0c;\u5927\u89c4\u6a21\u573a\u666f<\/td>\n<td>\u5e76\u884c\u8bad\u7ec3\u3001Sim-to-Real<\/td>\n<\/tr>\n<tr>\n<td>PyBullet<\/td>\n<td>Bullet<\/td>\n<td>\u5f00\u6e90\u514d\u8d39&#xff0c;\u6613\u90e8\u7f72<\/td>\n<td>\u5feb\u901f\u539f\u578b\u3001\u6559\u80b2\u6f14\u793a<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h5>4.2.4 
\u5de5\u4e1a\u81ea\u52a8\u5316\u4e0e\u751f\u4ea7<\/h5>\n<ul>\n<li>\u5e94\u7528&#xff1a;\u67d4\u6027\u5236\u9020\u4e2d\u7684\u88c5\u914d\u987a\u5e8f\u4f18\u5316\u3001AGV\u8def\u5f84\u89c4\u5212\u3001\u8d28\u91cf\u68c0\u6d4b\u7b56\u7565<\/li>\n<li>\u4ef7\u503c&#xff1a;\u9002\u5e94\u5c0f\u6279\u91cf\u591a\u54c1\u79cd\u751f\u4ea7&#xff0c;\u66ff\u4ee3\u786c\u7f16\u7801\u89c4\u5219&#xff0c;\u63d0\u5347\u4ea7\u7ebf\u67d4\u6027<\/li>\n<\/ul>\n<hr \/>\n<h3>\u4e94\u3001\u5f3a\u5316\u5b66\u4e60\u6848\u4f8b\u5206\u6790&#xff1a;\u4ece\u4eff\u771f\u5230\u771f\u5b9e\u673a\u5668\u4eba<\/h3>\n<h4>5.1 \u6848\u4f8b\u4e00&#xff1a;\u5939\u722a\u673a\u5668\u4eba\u6293\u53d6&#xff08;Gripper Grasping&#xff09;<\/h4>\n<h5>\u4efb\u52a1\u5b9a\u4e49<\/h5>\n<p>\u8bad\u7ec3\u673a\u68b0\u81c2\u5939\u722a\u5728\u6742\u4e71\u573a\u666f\u4e2d\u6293\u53d6\u76ee\u6807\u7269\u4f53\u5e76\u653e\u7f6e\u5230\u6307\u5b9a\u4f4d\u7f6e\u3002<\/p>\n<h5>\u72b6\u6001\u7a7a\u95f4&#xff08;State Space&#xff09;\u8bbe\u8ba1<\/h5>\n<ul>\n<li>\u89c6\u89c9\u8f93\u5165&#xff1a;RGB-D\u76f8\u673a\u56fe\u50cf&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         224<\/p>\n<p>         \u00d7<\/p>\n<p>         224<\/p>\n<p>         \u00d7<\/p>\n<p>         4<\/p>\n<p>        224 \\\\times 224 \\\\times 4<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">224<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">224<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span 
class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">4<\/span><\/span><\/span><\/span><\/span>&#xff09;<\/li>\n<li>\u672c\u4f53\u611f\u77e5&#xff1a;\u5939\u722a\u5f00\u5408\u89d2\u5ea6\u3001\u6307\u5c16\u529b\u77e9\u4f20\u611f\u5668\u8bfb\u6570<\/li>\n<li>\u4efb\u52a1\u72b6\u6001&#xff1a;\u76ee\u6807\u7269\u4f53\u4f4d\u59ff&#xff08;\u82e5\u53ef\u83b7\u53d6&#xff09;\u3001\u653e\u7f6e\u533a\u57df\u4f4d\u7f6e<\/li>\n<\/ul>\n<h5>\u52a8\u4f5c\u7a7a\u95f4&#xff08;Action Space&#xff09;\u8bbe\u8ba1<\/h5>\n<ul>\n<li>\u4f4d\u7f6e\u63a7\u5236&#xff1a;\u672b\u7aef\u6267\u884c\u5668\u5728XYZ\u5750\u6807\u7cfb\u7684\u4f4d\u79fb\u589e\u91cf <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         (<\/p>\n<p>         \u0394<\/p>\n<p>         x<\/p>\n<p>         ,<\/p>\n<p>         \u0394<\/p>\n<p>         y<\/p>\n<p>         ,<\/p>\n<p>         \u0394<\/p>\n<p>         z<\/p>\n<p>         )<\/p>\n<p>        (\\\\Delta x, \\\\Delta y, \\\\Delta z)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.044em\">z<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/li>\n<li>\u59ff\u6001\u63a7\u5236&#xff1a;\u5939\u722a\u7ed5Z\u8f74\u65cb\u8f6c\u89d2\u5ea6 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         \u0394<\/p>\n<p>         \u03b8<\/p>\n<p>        \\\\Delta \\\\theta<\/p>\n<p>     
<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6944em\"><\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><\/span><\/li>\n<li>\u5939\u722a\u63a7\u5236&#xff1a;\u5f00\u5408\u6307\u4ee4&#xff08;\u79bb\u6563\u6216\u8fde\u7eed\u503c&#xff09;<\/li>\n<\/ul>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          a<\/p>\n<p>          t<\/p>\n<p>         &#061;<\/p>\n<p>         [<\/p>\n<p>         \u0394<\/p>\n<p>         x<\/p>\n<p>         ,<\/p>\n<p>         \u0394<\/p>\n<p>         y<\/p>\n<p>         ,<\/p>\n<p>         \u0394<\/p>\n<p>         z<\/p>\n<p>         ,<\/p>\n<p>         \u0394<\/p>\n<p>         \u03b8<\/p>\n<p>         ,<\/p>\n<p>         gripper_cmd<\/p>\n<p>         ]<\/p>\n<p>         \u2208<\/p>\n<p>          R<\/p>\n<p>          5<\/p>\n<p>        \\\\mathbf{a}_t &#061; [\\\\Delta x, \\\\Delta y, \\\\Delta z, \\\\Delta \\\\theta, \\\\text{gripper\\\\_cmd}] \\\\in \\\\mathbb{R}^5<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.5944em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathbf\">a<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span 
class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.06em;vertical-align: -0.31em\"><\/span><span class=\"mopen\">[<\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.044em\">z<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">\u0394<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">\u03b8<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord text\"><span class=\"mord\">gripper_cmd<\/span><\/span><span class=\"mclose\">]<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2208<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8641em\"><\/span><span class=\"mord\"><span class=\"mord mathbb\">R<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8641em\"><span class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">5<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<h5>\u5956\u52b1\u51fd\u6570&#xff08;Reward 
Shaping&#xff09;\u8bbe\u8ba1<\/h5>\n<p>\u91c7\u7528\u7a00\u758f\u5956\u52b1\u4e0e\u5bc6\u96c6\u5956\u52b1\u7ed3\u5408\u7b56\u7565&#xff1a;<\/p>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          r<\/p>\n<p>          t<\/p>\n<p>         &#061;<\/p>\n<p>          r<\/p>\n<p>          sparse<\/p>\n<p>         &#043;<\/p>\n<p>          r<\/p>\n<p>          dense<\/p>\n<p>        r_t &#061; r_{\\\\text{sparse}} &#043; r_{\\\\text{dense}}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.5806em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8694em;vertical-align: -0.2861em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1514em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing 
reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord text mtight\"><span class=\"mord mtight\">sparse<\/span><\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2861em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.5806em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0278em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord text mtight\"><span class=\"mord mtight\">dense<\/span><\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<ul>\n<li>\n<p>\u7a00\u758f\u5956\u52b1&#xff1a;<\/p>\n<ul>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           &#043;<\/p>\n<p>           10<\/p>\n<p>          &#043;10<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">&#043;<\/span><span class=\"mord\">10<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u6210\u529f\u6293\u53d6\u5e76\u653e\u7f6e<\/li>\n<li><span class=\"katex--inline\"><span 
class=\"katex\"><span class=\"katex-mathml\">\n<p>           \u2212<\/p>\n<p>           5<\/p>\n<p>          -5<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\">5<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u6389\u843d\u6216\u78b0\u649e<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>\u5bc6\u96c6\u5956\u52b1&#xff08;\u5f15\u5bfc\u5b66\u4e60&#xff09;&#xff1a;<\/p>\n<ul>\n<li>\u63a5\u8fd1\u76ee\u6807&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           \u2212<\/p>\n<p>            \u03bb<\/p>\n<p>            1<\/p>\n<p>           \u22c5<\/p>\n<p>           distance<\/p>\n<p>           (<\/p>\n<p>           gripper<\/p>\n<p>           ,<\/p>\n<p>           object<\/p>\n<p>           )<\/p>\n<p>          &#8211; \\\\lambda_1 \\\\cdot \\\\text{distance}(\\\\text{gripper}, \\\\text{object})<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\"><span class=\"mord mathnormal\">\u03bb<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span 
class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord text\"><span class=\"mord\">distance<\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord text\"><span class=\"mord\">gripper<\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord text\"><span class=\"mord\">object<\/span><\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/li>\n<li>\u5bf9\u9f50\u59ff\u6001&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           \u2212<\/p>\n<p>            \u03bb<\/p>\n<p>            2<\/p>\n<p>           \u22c5<\/p>\n<p>           orientation_error<\/p>\n<p>          &#8211; \\\\lambda_2 \\\\cdot \\\\text{orientation\\\\_error}<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\"><span class=\"mord mathnormal\">\u03bb<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.9779em;vertical-align: -0.31em\"><\/span><span class=\"mord text\"><span 
class=\"mord\">orientation_error<\/span><\/span><\/span><\/span><\/span><\/span><\/li>\n<li>\u4fdd\u6301\u6293\u53d6&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           &#043;<\/p>\n<p>            \u03bb<\/p>\n<p>            3<\/p>\n<p>           \u22c5<\/p>\n<p>           grip_force<\/p>\n<p>          &#043; \\\\lambda_3 \\\\cdot \\\\text{grip\\\\_force}<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\">&#043;<\/span><span class=\"mord\"><span class=\"mord mathnormal\">\u03bb<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">3<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.0044em;vertical-align: -0.31em\"><\/span><span class=\"mord text\"><span class=\"mord\">grip_force<\/span><\/span><\/span><\/span><\/span><\/span>&#xff08;\u5f53\u63a5\u89e6\u7269\u4f53\u65f6&#xff09;<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h4>5.2 \u6848\u4f8b\u4e8c&#xff1a;\u673a\u5668\u4eba\u5e95\u76d8\u63a7\u5236&#xff08;Mobile Robot 
Navigation&#xff09;<\/h4>\n<h5>\u4efb\u52a1\u5b9a\u4e49<\/h5>\n<p>\u8f6e\u5f0f\u673a\u5668\u4eba\u5728\u52a8\u6001\u73af\u5883\u4e2d\u81ea\u4e3b\u5bfc\u822a\u81f3\u76ee\u6807\u70b9&#xff0c;\u540c\u65f6\u907f\u969c\u5e76\u6ee1\u8db3\u8fd0\u52a8\u5b66\u7ea6\u675f\u3002<\/p>\n<h5>\u72b6\u6001\u7a7a\u95f4<\/h5>\n<ul>\n<li>\u6fc0\u5149\u96f7\u8fbe&#xff1a;360\u5ea6\u8ddd\u79bb\u626b\u63cf&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         L<\/p>\n<p>         &#061;<\/p>\n<p>         {<\/p>\n<p>          d<\/p>\n<p>          1<\/p>\n<p>         ,<\/p>\n<p>          d<\/p>\n<p>          2<\/p>\n<p>         ,<\/p>\n<p>         .<\/p>\n<p>         .<\/p>\n<p>         .<\/p>\n<p>         ,<\/p>\n<p>          d<\/p>\n<p>          360<\/p>\n<p>         }<\/p>\n<p>        L &#061; \\\\{d_1, d_2, &#8230;, d_{360}\\\\}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\">L<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">{<\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span 
class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\">&#8230;<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">360<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">}<\/span><\/span><\/span><\/span><\/span>&#xff09;<\/li>\n<li>\u91cc\u7a0b\u8ba1&#xff1a;\u5f53\u524d\u4f4d\u7f6e <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         (<\/p>\n<p>         x<\/p>\n<p>         ,<\/p>\n<p>         y<\/p>\n<p>         )<\/p>\n<p>        (x,y)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 
1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">y<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>\u3001\u671d\u5411 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         \u03b8<\/p>\n<p>        \\\\theta<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><\/span>\u3001\u7ebf\u901f\u5ea6 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         v<\/p>\n<p>        v<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><\/span><\/span><\/span><\/span>\u3001\u89d2\u901f\u5ea6 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         \u03c9<\/p>\n<p>        \\\\omega<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c9<\/span><\/span><\/span><\/span><\/span><\/li>\n<li>\u76ee\u6807\u4fe1\u606f&#xff1a;\u76f8\u5bf9\u76ee\u6807\u70b9\u7684\u8ddd\u79bb\u4e0e\u65b9\u4f4d\u89d2<\/li>\n<\/ul>\n<h5>\u52a8\u4f5c\u7a7a\u95f4<\/h5>\n<ul>\n<li>\u7ebf\u901f\u5ea6&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         v<\/p>\n<p>         \u2208<\/p>\n<p>         [<\/p>\n<p>         0<\/p>\n<p>         ,<\/p>\n<p>          v<\/p>\n<p>           m<\/p>\n<p>           a<\/p>\n<p>           x<\/p>\n<p>       
  ]<\/p>\n<p>        v \\\\in [0, v_{max}]<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.5782em;vertical-align: -0.0391em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2208<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">[<\/span><span class=\"mord\">0<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1514em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">ma<\/span><span class=\"mord mathnormal mtight\">x<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">]<\/span><\/span><\/span><\/span><\/span>&#xff08;\u524d\u8fdb\u901f\u5ea6&#xff09;<\/li>\n<li>\u89d2\u901f\u5ea6&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         \u03c9<\/p>\n<p>         \u2208<\/p>\n<p>         [<\/p>\n<p>         \u2212<\/p>\n<p>          \u03c9<\/p>\n<p>           m<\/p>\n<p>           a<\/p>\n<p>           x<\/p>\n<p>         ,<\/p>\n<p>          \u03c9<\/p>\n<p>           m<\/p>\n<p>           a<\/p>\n<p>           x<\/p>\n<p>         ]<\/p>\n<p>       
 \\\\omega \\\\in [-\\\\omega_{max}, \\\\omega_{max}]<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.5782em;vertical-align: -0.0391em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c9<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2208<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">[<\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c9<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1514em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">ma<\/span><span class=\"mord mathnormal mtight\">x<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c9<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1514em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">ma<\/span><span class=\"mord mathnormal 
mtight\">x<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">]<\/span><\/span><\/span><\/span><\/span>&#xff08;\u8f6c\u5411\u901f\u5ea6&#xff09;<\/li>\n<\/ul>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          a<\/p>\n<p>          t<\/p>\n<p>         &#061;<\/p>\n<p>         [<\/p>\n<p>         v<\/p>\n<p>         ,<\/p>\n<p>         \u03c9<\/p>\n<p>         ]<\/p>\n<p>         \u2208<\/p>\n<p>          R<\/p>\n<p>          2<\/p>\n<p>        \\\\mathbf{a}_t &#061; [v, \\\\omega] \\\\in \\\\mathbb{R}^2<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.5944em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathbf\">a<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">[<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" 
style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">\u03c9<\/span><span class=\"mclose\">]<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2208<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8641em\"><\/span><span class=\"mord\"><span class=\"mord mathbb\">R<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8641em\"><span class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<h5>\u5956\u52b1\u4e0e\u60e9\u7f5a\u673a\u5236<\/h5>\n<table>\n<thead>\n<tr>\n<th>\u4e8b\u4ef6<\/th>\n<th>\u5956\u52b1\u503c<\/th>\n<th>\u8bbe\u8ba1\u610f\u56fe<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u5230\u8fbe\u76ee\u6807<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           &#043;<\/p>\n<p>           100<\/p>\n<p>          &#043;100<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">&#043;<\/span><span class=\"mord\">100<\/span><\/span><\/span><\/span><\/span><\/td>\n<td>\u4efb\u52a1\u5b8c\u6210\u6fc0\u52b1<\/td>\n<\/tr>\n<tr>\n<td>\u78b0\u649e<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           \u2212<\/p>\n<p>           50<\/p>\n<p>          -50<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">\u2212<\/span><span 
class=\"mord\">50<\/span><\/span><\/span><\/span><\/span><\/td>\n<td>\u5b89\u5168\u7ea6\u675f<\/td>\n<\/tr>\n<tr>\n<td>\u8d85\u65f6<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           \u2212<\/p>\n<p>           20<\/p>\n<p>          -20<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\">20<\/span><\/span><\/span><\/span><\/span><\/td>\n<td>\u6548\u7387\u7ea6\u675f<\/td>\n<\/tr>\n<tr>\n<td>\u63a5\u8fd1\u76ee\u6807<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           &#043;<\/p>\n<p>           \u0394<\/p>\n<p>            d<\/p>\n<p>             t<\/p>\n<p>             a<\/p>\n<p>             r<\/p>\n<p>             g<\/p>\n<p>             e<\/p>\n<p>             t<\/p>\n<p>          &#043; \\\\Delta d_{target}<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.9805em;vertical-align: -0.2861em\"><\/span><span class=\"mord\">&#043;<\/span><span class=\"mord\">\u0394<\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">t<\/span><span class=\"mord mathnormal mtight\">a<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">r<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0359em\">g<\/span><span class=\"mord mathnormal mtight\">e<\/span><span class=\"mord mathnormal 
mtight\">t<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2861em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/td>\n<td>\u8fdb\u5ea6\u5f15\u5bfc<\/td>\n<\/tr>\n<tr>\n<td>\u8d85\u901f<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           \u2212<\/p>\n<p>           \u03b1<\/p>\n<p>           \u22c5<\/p>\n<p>           (<\/p>\n<p>           v<\/p>\n<p>           \u2212<\/p>\n<p>            v<\/p>\n<p>             l<\/p>\n<p>             i<\/p>\n<p>             m<\/p>\n<p>             i<\/p>\n<p>             t<\/p>\n<p>            )<\/p>\n<p>            2<\/p>\n<p>          &#8211; \\\\alpha \\\\cdot (v &#8211; v_{limit})^2<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6667em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0037em\">\u03b1<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u2212<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.0641em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span 
class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><span class=\"mord mathnormal mtight\">imi<\/span><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\"><span class=\"mclose\">)<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8141em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/td>\n<td>\u8fd0\u52a8\u5b66\u7ea6\u675f<\/td>\n<\/tr>\n<tr>\n<td>\u9760\u8fd1\u969c\u788d\u7269<\/td>\n<td><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           \u2212<\/p>\n<p>           \u03b2<\/p>\n<p>           \u22c5<\/p>\n<p>            1<\/p>\n<p>             d<\/p>\n<p>              o<\/p>\n<p>              b<\/p>\n<p>              s<\/p>\n<p>              t<\/p>\n<p>              a<\/p>\n<p>              c<\/p>\n<p>              l<\/p>\n<p>              e<\/p>\n<p>          &#8211; \\\\beta \\\\cdot \\\\frac{1}{d_{obstacle}}<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8889em;vertical-align: -0.1944em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0528em\">\u03b2<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span 
class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.296em;vertical-align: -0.4509em\"><\/span><span class=\"mord\"><span class=\"mopen nulldelimiter\"><\/span><span class=\"mfrac\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8451em\"><span class=\"\" style=\"top: -2.655em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3448em\"><span class=\"\" style=\"top: -2.3488em;margin-left: 0em;margin-right: 0.0714em\"><span class=\"pstrut\" style=\"height: 2.5em\"><\/span><span class=\"sizing reset-size3 size1 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">o<\/span><span class=\"mord mathnormal mtight\">b<\/span><span class=\"mord mathnormal mtight\">s<\/span><span class=\"mord mathnormal mtight\">t<\/span><span class=\"mord mathnormal mtight\">a<\/span><span class=\"mord mathnormal mtight\">c<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><span class=\"mord mathnormal mtight\">e<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1512em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"\" style=\"top: -3.23em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"frac-line\" style=\"border-bottom-width: 0.04em\"><\/span><\/span><span class=\"\" style=\"top: -3.394em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span 
class=\"mord mtight\">1<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.4509em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><span class=\"mclose nulldelimiter\"><\/span><\/span><\/span><\/span><\/span><\/span><\/td>\n<td>\u907f\u969c\u5f15\u5bfc<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>\u516d\u3001\u5f3a\u5316\u5b66\u4e60\u7684\u4ee3\u7801\u5b9e\u73b0\u4e0e\u5f00\u6e90\u8d44\u6e90<\/h3>\n<h4>6.1 \u6838\u5fc3\u5de5\u5177\u5e93&#xff1a;Stable Baselines3 &#043; Gymnasium<\/h4>\n<p>Gymnasium&#xff08;\u539fOpenAI Gym&#xff09; \u63d0\u4f9b\u6807\u51c6\u5316\u7684\u73af\u5883\u63a5\u53e3&#xff08;Env API&#xff09;&#xff0c;\u662fRL\u7b97\u6cd5\u7684\u901a\u7528\u6d4b\u8bd5\u5e73\u53f0\u3002<\/p>\n<p><span class=\"token comment\"># \u73af\u5883\u63a5\u53e3\u6807\u51c6<\/span><br \/>\n<span class=\"token keyword\">import<\/span> gymnasium <span class=\"token keyword\">as<\/span> gym<\/p>\n<p>env <span class=\"token operator\">&#061;<\/span> gym<span class=\"token punctuation\">.<\/span>make<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#039;CartPole-v1&#039;<\/span><span class=\"token punctuation\">)<\/span><br \/>\nobservation<span class=\"token punctuation\">,<\/span> info <span class=\"token operator\">&#061;<\/span> env<span class=\"token punctuation\">.<\/span>reset<span class=\"token punctuation\">(<\/span>seed<span class=\"token operator\">&#061;<\/span><span class=\"token number\">42<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token keyword\">for<\/span> _ <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">1000<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    action <span class=\"token operator\">&#061;<\/span> env<span class=\"token 
punctuation\">.<\/span>action_space<span class=\"token punctuation\">.<\/span>sample<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u968f\u673a\u7b56\u7565<\/span><br \/>\n    observation<span class=\"token punctuation\">,<\/span> reward<span class=\"token punctuation\">,<\/span> terminated<span class=\"token punctuation\">,<\/span> truncated<span class=\"token punctuation\">,<\/span> info <span class=\"token operator\">&#061;<\/span> env<span class=\"token punctuation\">.<\/span>step<span class=\"token punctuation\">(<\/span>action<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">if<\/span> terminated <span class=\"token keyword\">or<\/span> truncated<span class=\"token punctuation\">:<\/span><br \/>\n        observation<span class=\"token punctuation\">,<\/span> info <span class=\"token operator\">&#061;<\/span> env<span class=\"token punctuation\">.<\/span>reset<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>env<span class=\"token punctuation\">.<\/span>close<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<ul>\n<li>\u5173\u952e\u6982\u5ff5&#xff1a;\n<ul>\n<li>reset()&#xff1a;\u521d\u59cb\u5316\u73af\u5883&#xff0c;\u8fd4\u56de\u521d\u59cb\u72b6\u6001 (observation, info)<\/li>\n<li>step(action)&#xff1a;\u6267\u884c\u52a8\u4f5c&#xff0c;\u8fd4\u56de (observation, reward, terminated, truncated, info)<\/li>\n<li>action_space \/ observation_space&#xff1a;\u52a8\u4f5c\u4e0e\u72b6\u6001\u7684\u8fb9\u754c\u5b9a\u4e49&#xff08;Box\u3001Discrete\u7b49&#xff09;<\/li>\n<\/ul>\n<\/li>\n<li>Stable Baselines3&#xff08;SB3&#xff09; 
\u57fa\u4e8ePyTorch\u7684\u9ad8\u8d28\u91cfRL\u7b97\u6cd5\u5b9e\u73b0\u5e93&#xff0c;\u63d0\u4f9b\u6a21\u5757\u5316\u3001\u53ef\u6269\u5c55\u7684\u7b97\u6cd5\u7ec4\u4ef6\u3002<\/li>\n<li>\u652f\u6301\u7b97\u6cd5&#xff1a;<\/li>\n<\/ul>\n<table>\n<thead>\n<tr>\n<th>\u7b97\u6cd5<\/th>\n<th>\u7c7b\u578b<\/th>\n<th>\u7279\u70b9<\/th>\n<th>\u9002\u7528\u573a\u666f<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>DQN<\/td>\n<td>Value-Based<\/td>\n<td>\u7ecf\u9a8c\u56de\u653e\u3001\u76ee\u6807\u7f51\u7edc<\/td>\n<td>\u79bb\u6563\u52a8\u4f5c\u3001Atari\u6e38\u620f<\/td>\n<\/tr>\n<tr>\n<td>A2C\/A3C<\/td>\n<td>Actor-Critic<\/td>\n<td>\u540c\u6b65\/\u5f02\u6b65\u5e76\u884c\u8bad\u7ec3<\/td>\n<td>\u7b80\u5355\u8fde\u7eed\u63a7\u5236<\/td>\n<\/tr>\n<tr>\n<td>PPO<\/td>\n<td>On-Policy AC<\/td>\n<td>\u88c1\u526a\u76ee\u6807\u3001\u7a33\u5b9a\u9ad8\u6548<\/td>\n<td>\u901a\u7528\u9996\u9009&#xff0c;\u63a8\u8350\u5165\u95e8<\/td>\n<\/tr>\n<tr>\n<td>SAC<\/td>\n<td>Off-Policy AC<\/td>\n<td>\u6700\u5927\u71b5\u3001\u81ea\u52a8\u6e29\u5ea6\u8c03\u8282<\/td>\n<td>\u8fde\u7eed\u63a7\u5236\u3001\u6837\u672c\u9ad8\u6548<\/td>\n<\/tr>\n<tr>\n<td>TD3<\/td>\n<td>Off-Policy AC<\/td>\n<td>\u53ccQ\u7f51\u7edc\u3001\u5ef6\u8fdf\u7b56\u7565\u66f4\u65b0<\/td>\n<td>\u8fde\u7eed\u63a7\u5236\u3001\u89e3\u51b3\u8fc7\u4f30\u8ba1<\/td>\n<\/tr>\n<tr>\n<td>DDPG<\/td>\n<td>Off-Policy AC<\/td>\n<td>\u786e\u5b9a\u6027\u7b56\u7565\u68af\u5ea6<\/td>\n<td>\u8fde\u7eed\u63a7\u5236&#xff08;\u5df2\u88abSAC\/TD3\u8d85\u8d8a&#xff09;<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>6.2 \u5b9e\u6218\u4ee3\u7801\u6846\u67b6<\/h4>\n<ul>\n<li>\u793a\u4f8b&#xff1a;\u5012\u7acb\u6446\u63a7\u5236&#xff08;CartPole&#xff09;\u4e0ePPO\u8bad\u7ec3<\/li>\n<\/ul>\n<p><span class=\"token keyword\">import<\/span> gymnasium <span class=\"token keyword\">as<\/span> gym<br \/>\n<span class=\"token keyword\">from<\/span> stable_baselines3 <span class=\"token keyword\">import<\/span> PPO<br \/>\n<span class=\"token keyword\">from<\/span> stable_baselines3<span class=\"token punctuation\">.<\/span>common<span class=\"token 
punctuation\">.<\/span>evaluation <span class=\"token keyword\">import<\/span> evaluate_policy<\/p>\n<p><span class=\"token comment\"># 1. \u521b\u5efa\u73af\u5883<\/span><br \/>\nenv <span class=\"token operator\">&#061;<\/span> gym<span class=\"token punctuation\">.<\/span>make<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#039;CartPole-v1&#039;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># 2. \u521d\u59cb\u5316PPO\u6a21\u578b<\/span><br \/>\n<span class=\"token comment\"># MlpPolicy&#xff1a;\u591a\u5c42\u611f\u77e5\u673a\u7b56\u7565\u7f51\u7edc<\/span><br \/>\n<span class=\"token comment\"># verbose&#061;1&#xff1a;\u6253\u5370\u8bad\u7ec3\u65e5\u5fd7<\/span><br \/>\nmodel <span class=\"token operator\">&#061;<\/span> PPO<span class=\"token punctuation\">(<\/span><br \/>\n    <span class=\"token string\">&#034;MlpPolicy&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    env<span class=\"token punctuation\">,<\/span><br \/>\n    verbose<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    learning_rate<span class=\"token operator\">&#061;<\/span><span class=\"token number\">3e-4<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    n_steps<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2048<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    batch_size<span class=\"token operator\">&#061;<\/span><span class=\"token number\">64<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    n_epochs<span class=\"token operator\">&#061;<\/span><span class=\"token number\">10<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    gamma<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.99<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    gae_lambda<span class=\"token operator\">&#061;<\/span><span 
class=\"token number\">0.95<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    clip_range<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.2<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    tensorboard_log<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;.\/ppo_cartpole_tensorboard\/&#034;<\/span><br \/>\n<span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># 3. \u8bad\u7ec3\u6a21\u578b<\/span><br \/>\n<span class=\"token comment\"># total_timesteps&#xff1a;\u603b\u4ea4\u4e92\u6b65\u6570<\/span><br \/>\nmodel<span class=\"token punctuation\">.<\/span>learn<span class=\"token punctuation\">(<\/span>total_timesteps<span class=\"token operator\">&#061;<\/span><span class=\"token number\">100000<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># 4. \u4fdd\u5b58\u6a21\u578b<\/span><br \/>\nmodel<span class=\"token punctuation\">.<\/span>save<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;ppo_cartpole&#034;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># 5. 
\u8bc4\u4f30\u6027\u80fd<\/span><br \/>\nmean_reward<span class=\"token punctuation\">,<\/span> std_reward <span class=\"token operator\">&#061;<\/span> evaluate_policy<span class=\"token punctuation\">(<\/span>model<span class=\"token punctuation\">,<\/span> env<span class=\"token punctuation\">,<\/span> n_eval_episodes<span class=\"token operator\">&#061;<\/span><span class=\"token number\">10<\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;Mean reward: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>mean_reward<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.2f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\"> &#043;\/- <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>std_reward<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.2f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># 6. 
\u90e8\u7f72\u63a8\u7406<\/span><br \/>\nobs<span class=\"token punctuation\">,<\/span> info <span class=\"token operator\">&#061;<\/span> env<span class=\"token punctuation\">.<\/span>reset<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">for<\/span> _ <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">1000<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    action<span class=\"token punctuation\">,<\/span> _states <span class=\"token operator\">&#061;<\/span> model<span class=\"token punctuation\">.<\/span>predict<span class=\"token punctuation\">(<\/span>obs<span class=\"token punctuation\">,<\/span> deterministic<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    obs<span class=\"token punctuation\">,<\/span> reward<span class=\"token punctuation\">,<\/span> terminated<span class=\"token punctuation\">,<\/span> truncated<span class=\"token punctuation\">,<\/span> info <span class=\"token operator\">&#061;<\/span> env<span class=\"token punctuation\">.<\/span>step<span class=\"token punctuation\">(<\/span>action<span class=\"token punctuation\">)<\/span><br \/>\n    env<span class=\"token punctuation\">.<\/span>render<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">if<\/span> terminated <span class=\"token keyword\">or<\/span> truncated<span class=\"token punctuation\">:<\/span><br \/>\n        obs<span class=\"token punctuation\">,<\/span> info <span class=\"token operator\">&#061;<\/span> env<span class=\"token punctuation\">.<\/span>reset<span class=\"token punctuation\">(<\/span><span class=\"token 
punctuation\">)<\/span><\/p>\n<ul>\n<li>\u793a\u4f8b&#xff1a;\u81ea\u5b9a\u4e49\u673a\u5668\u4eba\u73af\u5883&#xff08;Gymnasium\u63a5\u53e3&#xff09;<\/li>\n<\/ul>\n<p><span class=\"token keyword\">import<\/span> gymnasium <span class=\"token keyword\">as<\/span> gym<br \/>\n<span class=\"token keyword\">from<\/span> gymnasium <span class=\"token keyword\">import<\/span> spaces<br \/>\n<span class=\"token keyword\">import<\/span> numpy <span class=\"token keyword\">as<\/span> np<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">RobotGraspEnv<\/span><span class=\"token punctuation\">(<\/span>gym<span class=\"token punctuation\">.<\/span>Env<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u81ea\u5b9a\u4e49\u5939\u722a\u6293\u53d6\u73af\u5883\u793a\u4f8b&#034;&#034;&#034;<\/span><\/p>\n<p>    metadata <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#039;render_modes&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;human&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">}<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> render_mode<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token 
punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u5b9a\u4e49\u72b6\u6001\u7a7a\u95f4&#xff1a;\u673a\u68b0\u81c2\u5173\u8282\u89d2\u5ea6(6) &#043; \u5939\u722a\u4f4d\u7f6e(1) &#043; \u76ee\u6807\u4f4d\u7f6e(3)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>observation_space <span class=\"token operator\">&#061;<\/span> spaces<span class=\"token punctuation\">.<\/span>Box<span class=\"token punctuation\">(<\/span><br \/>\n            low<span class=\"token operator\">&#061;<\/span><span class=\"token operator\">-<\/span>np<span class=\"token punctuation\">.<\/span>inf<span class=\"token punctuation\">,<\/span> high<span class=\"token operator\">&#061;<\/span>np<span class=\"token punctuation\">.<\/span>inf<span class=\"token punctuation\">,<\/span> shape<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">10<\/span><span class=\"token punctuation\">,<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span> dtype<span class=\"token operator\">&#061;<\/span>np<span class=\"token punctuation\">.<\/span>float32<br \/>\n        <span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u5b9a\u4e49\u52a8\u4f5c\u7a7a\u95f4&#xff1a;\u5173\u8282\u901f\u5ea6(6) &#043; \u5939\u722a\u5f00\u5408(1)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>action_space <span class=\"token operator\">&#061;<\/span> spaces<span class=\"token punctuation\">.<\/span>Box<span class=\"token punctuation\">(<\/span><br \/>\n            low<span class=\"token operator\">&#061;<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">1.0<\/span><span class=\"token punctuation\">,<\/span> high<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1.0<\/span><span class=\"token punctuation\">,<\/span> shape<span class=\"token 
operator\">&#061;<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">7<\/span><span class=\"token punctuation\">,<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span> dtype<span class=\"token operator\">&#061;<\/span>np<span class=\"token punctuation\">.<\/span>float32<br \/>\n        <span class=\"token punctuation\">)<\/span><\/p>\n<p>        self<span class=\"token punctuation\">.<\/span>render_mode <span class=\"token operator\">&#061;<\/span> render_mode<br \/>\n        self<span class=\"token punctuation\">.<\/span>max_steps <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">100<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>current_step <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">reset<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> seed<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> options<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>reset<span class=\"token punctuation\">(<\/span>seed<span class=\"token operator\">&#061;<\/span>seed<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>current_step <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0<\/span><\/p>\n<p>        <span class=\"token comment\"># \u521d\u59cb\u5316\u968f\u673a\u72b6\u6001<\/span><br \/>\n        self<span class=\"token 
punctuation\">.<\/span>state <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>np_random<span class=\"token punctuation\">.<\/span>uniform<span class=\"token punctuation\">(<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">0.5<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.5<\/span><span class=\"token punctuation\">,<\/span> size<span class=\"token operator\">&#061;<\/span><span class=\"token number\">10<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>target_pos <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>np_random<span class=\"token punctuation\">.<\/span>uniform<span class=\"token punctuation\">(<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">0.3<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.3<\/span><span class=\"token punctuation\">,<\/span> size<span class=\"token operator\">&#061;<\/span><span class=\"token number\">3<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        info <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token punctuation\">}<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> self<span class=\"token punctuation\">.<\/span>state<span class=\"token punctuation\">,<\/span> info<\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">step<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> action<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># \u6a21\u62df\u73af\u5883\u52a8\u6001&#xff08;\u5b9e\u9645\u5e94\u8c03\u7528\u7269\u7406\u5f15\u64ce\u5982PyBullet\/MuJoCo&#xff09;<\/span><br \/>\n        self<span 
class=\"token punctuation\">.<\/span>state<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">7<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&#043;&#061;<\/span> action <span class=\"token operator\">*<\/span> <span class=\"token number\">0.1<\/span>  <span class=\"token comment\"># \u7b80\u5316\u7684\u8fd0\u52a8\u5b66\u66f4\u65b0<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>current_step <span class=\"token operator\">&#043;&#061;<\/span> <span class=\"token number\">1<\/span><\/p>\n<p>        <span class=\"token comment\"># \u8ba1\u7b97\u5956\u52b1<\/span><br \/>\n        gripper_pos <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>state<span class=\"token punctuation\">[<\/span><span class=\"token number\">7<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">10<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        distance <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>linalg<span class=\"token punctuation\">.<\/span>norm<span class=\"token punctuation\">(<\/span>gripper_pos <span class=\"token operator\">-<\/span> self<span class=\"token punctuation\">.<\/span>target_pos<span class=\"token punctuation\">)<\/span><\/p>\n<p>        reward <span class=\"token operator\">&#061;<\/span> <span class=\"token operator\">-<\/span>distance  <span class=\"token comment\"># \u8ddd\u79bb\u60e9\u7f5a<\/span><\/p>\n<p>        <span class=\"token comment\"># \u6210\u529f\u6293\u53d6\u5224\u65ad&#xff08;\u7b80\u5316&#xff09;<\/span><br \/>\n        terminated <span class=\"token operator\">&#061;<\/span> distance <span class=\"token operator\">&lt;<\/span> <span class=\"token number\">0.05<\/span><br \/>\n        truncated <span class=\"token operator\">&#061;<\/span> self<span class=\"token 
punctuation\">.<\/span>current_step <span class=\"token operator\">&gt;&#061;<\/span> self<span class=\"token punctuation\">.<\/span>max_steps<\/p>\n<p>        <span class=\"token keyword\">if<\/span> terminated<span class=\"token punctuation\">:<\/span><br \/>\n            reward <span class=\"token operator\">&#043;&#061;<\/span> <span class=\"token number\">10.0<\/span>  <span class=\"token comment\"># \u6210\u529f\u5956\u52b1<\/span><\/p>\n<p>        info <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#039;distance&#039;<\/span><span class=\"token punctuation\">:<\/span> distance<span class=\"token punctuation\">}<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> self<span class=\"token punctuation\">.<\/span>state<span class=\"token punctuation\">,<\/span> reward<span class=\"token punctuation\">,<\/span> terminated<span class=\"token punctuation\">,<\/span> truncated<span class=\"token punctuation\">,<\/span> info<\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">render<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> self<span class=\"token punctuation\">.<\/span>render_mode <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;human&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;Step: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>self<span class=\"token punctuation\">.<\/span>current_step<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">, Distance: <\/span><span class=\"token 
interpolation\"><span class=\"token punctuation\">{<\/span>np<span class=\"token punctuation\">.<\/span>linalg<span class=\"token punctuation\">.<\/span>norm<span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>state<span class=\"token punctuation\">[<\/span><span class=\"token number\">7<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">10<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">-<\/span> self<span class=\"token punctuation\">.<\/span>target_pos<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.3f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">close<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">pass<\/span><\/p>\n<p><span class=\"token comment\"># \u6ce8\u518c\u5e76\u4f7f\u7528\u73af\u5883<\/span><br \/>\ngym<span class=\"token punctuation\">.<\/span>register<span class=\"token punctuation\">(<\/span><span class=\"token builtin\">id<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#039;RobotGrasp-v0&#039;<\/span><span class=\"token punctuation\">,<\/span> entry_point<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#039;__main__:RobotGraspEnv&#039;<\/span><span class=\"token punctuation\">)<\/span><br \/>\nenv <span class=\"token operator\">&#061;<\/span> gym<span class=\"token punctuation\">.<\/span>make<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#039;RobotGrasp-v0&#039;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>6.3 
\u5f00\u6e90\u8d44\u6e90\u5bfc\u822a<\/h4>\n<p>\u7ecf\u5178\u5165\u95e8\u9879\u76ee<\/p>\n<table>\n<thead>\n<tr>\n<th>\u9879\u76ee\u540d\u79f0<\/th>\n<th>\u94fe\u63a5<\/th>\n<th>\u63cf\u8ff0<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>Stable Baselines3<\/td>\n<td>https:\/\/github.com\/DLR-RM\/stable-baselines3<\/td>\n<td>\u6700\u63a8\u8350\u7684RL\u7b97\u6cd5\u5e93<\/td>\n<\/tr>\n<tr>\n<td>Gymnasium<\/td>\n<td>https:\/\/github.com\/Farama-Foundation\/Gymnasium<\/td>\n<td>\u6807\u51c6\u73af\u5883\u63a5\u53e3<\/td>\n<\/tr>\n<tr>\n<td>CleanRL<\/td>\n<td>https:\/\/github.com\/vwxyzjn\/cleanrl<\/td>\n<td>\u5355\u6587\u4ef6\u7b97\u6cd5\u5b9e\u73b0&#xff0c;\u9002\u5408\u5b66\u4e60<\/td>\n<\/tr>\n<tr>\n<td>RL Baselines3 Zoo<\/td>\n<td>https:\/\/github.com\/DLR-RM\/rl-baselines3-zoo<\/td>\n<td>\u9884\u8bad\u7ec3\u6a21\u578b\u4e0e\u8c03\u4f18\u6307\u5357<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u5f00\u6e90\u4e66\u7c4d\u4e0e\u6559\u7a0b<\/p>\n<ul>\n<li>\u300aReinforcement Learning: An Introduction\u300b&#xff08;Sutton &amp; Barto&#xff09;&#xff1a;RL\u5723\u7ecf&#xff0c;\u514d\u8d39\u7535\u5b50\u7248<\/li>\n<li>Spinning Up in Deep RL&#xff08;OpenAI&#xff09;&#xff1a;\u6df1\u5ea6RL\u6559\u80b2\u9879\u76ee&#xff0c;\u542b\u7406\u8bba&#043;\u4ee3\u7801<\/li>\n<li>\u52a8\u624b\u5b66\u5f3a\u5316\u5b66\u4e60&#xff08;\u5f20\u4f1f\u6960\u7b49&#xff09;&#xff1a;\u4e2d\u6587\u5b9e\u8df5\u6559\u7a0b&#xff0c;\u542bPyTorch\u5b9e\u73b0<\/li>\n<\/ul>\n<p>\u5728\u7ebf\u8bfe\u7a0b<\/p>\n<ul>\n<li>CS285: Deep Reinforcement Learning&#xff08;UC Berkeley, Sergey Levine&#xff09;<\/li>\n<li>Reinforcement Learning Specialization&#xff08;Coursera, University of Alberta&#xff09;<\/li>\n<li>\u674e\u5b8f\u6bc5\u673a\u5668\u5b66\u4e60\u8bfe\u7a0b&#xff1a;\u542bRL\u4e13\u9898&#xff0c;\u4e2d\u6587\u8bb2\u89e3\u6e05\u6670<\/li>\n<\/ul>\n<p>\u524d\u6cbf\u7814\u7a76\u65b9\u5411<\/p>\n<ul>\n<li>Offline RL&#xff1a;\u4ece\u56fa\u5b9a\u6570\u636e\u96c6\u5b66\u4e60&#xff0c;\u907f\u514d\u5728\u7ebf\u4ea4\u4e92\u98ce\u9669<\/li>\n<li>Multi-Agent 
RL&#xff1a;\u591a\u667a\u80fd\u4f53\u534f\u4f5c\u4e0e\u7ade\u4e89<\/li>\n<li>Meta-RL&#xff1a;\u5b66\u4f1a\u5b66\u4e60&#xff0c;\u5feb\u901f\u9002\u5e94\u65b0\u4efb\u52a1<\/li>\n<li>Safe RL&#xff1a;\u5b89\u5168\u7ea6\u675f\u4e0b\u7684\u7b56\u7565\u4f18\u5316<\/li>\n<\/ul>\n<h3>\u4e03\u3001\u7ed3\u8bba\u4e0e\u5c55\u671b<\/h3>\n<h4>7.1 \u6280\u672f\u603b\u7ed3<\/h4>\n<ul>\n<li>\u5f3a\u5316\u5b66\u4e60\u4f5c\u4e3a\u9762\u5411\u5e8f\u8d2f\u51b3\u7b56\u7684\u673a\u5668\u5b66\u4e60\u8303\u5f0f&#xff0c;\u5728\u4ee5\u4e0b\u9886\u57df\u5c55\u73b0\u51fa\u5de8\u5927\u6f5c\u529b&#xff1a;<\/li>\n<li>\u6e38\u620fAI&#xff1a;AlphaGo\u3001AlphaStar\u3001OpenAI Five\u7b49\u7a81\u7834\u4eba\u7c7b\u9876\u5c16\u6c34\u5e73<\/li>\n<li>\u673a\u5668\u4eba\u63a7\u5236&#xff1a;\u4ece\u4eff\u771f\u5230\u771f\u5b9e\u73af\u5883\u7684\u6280\u80fd\u8fc1\u79fb\u65e5\u8d8b\u6210\u719f<\/li>\n<li>\u81ea\u52a8\u9a7e\u9a76&#xff1a;\u51b3\u7b56\u89c4\u5212\u6a21\u5757\u7684\u7aef\u5230\u7aef\u4f18\u5316<\/li>\n<li>\u5927\u6a21\u578b\u5bf9\u9f50&#xff1a;RLHF\u6210\u4e3a\u6784\u5efa\u5b89\u5168\u3001\u6709\u7528AI\u7684\u5173\u952e\u6280\u672f<\/li>\n<li>\u8d44\u6e90\u8c03\u5ea6&#xff1a;\u6570\u636e\u4e2d\u5fc3\u51b7\u5374\u3001\u82af\u7247\u8bbe\u8ba1&#xff08;Google TPU\u5e03\u5c40&#xff09;\u3001\u4ea4\u901a\u4fe1\u53f7\u63a7\u5236<\/li>\n<\/ul>\n<h4>7.2 \u5de5\u7a0b\u5b9e\u8df5\u5efa\u8bae<\/h4>\n<ul>\n<li>\u4ece\u4eff\u771f\u5f00\u59cb&#xff1a;\u5229\u7528Gymnasium\u548cPyBullet\u5feb\u901f\u9a8c\u8bc1\u7b97\u6cd5&#xff0c;\u518d\u8fc1\u79fb\u5230\u771f\u5b9e\u786c\u4ef6<\/li>\n<li>\u91cd\u89c6\u5956\u52b1\u8bbe\u8ba1&#xff1a;\u5956\u52b1\u51fd\u6570\u662fRL\u7684&#034;\u7f16\u7a0b\u8bed\u8a00&#034;&#xff0c;\u9700\u7cbe\u5fc3\u8bbe\u8ba1\u907f\u514d\u5956\u52b1\u9ed1\u5ba2&#xff08;Reward 
Hacking&#xff09;<\/li>\n<li>\u5173\u6ce8\u6837\u672c\u6548\u7387&#xff1a;\u4f18\u5148\u9009\u62e9SAC\u3001TD3\u7b49Off-Policy\u7b97\u6cd5&#xff0c;\u7ed3\u5408\u7ecf\u9a8c\u56de\u653e\u63d0\u5347\u6570\u636e\u5229\u7528\u7387<\/li>\n<li>Sim-to-Real\u6280\u5de7&#xff1a;\u57df\u968f\u673a\u5316\u3001\u7cfb\u7edf\u8fa8\u8bc6\u3001\u6b8b\u5dee\u7b56\u7565\u5b66\u4e60\u7f29\u5c0f\u4eff\u771f\u4e0e\u73b0\u5b9e\u5dee\u8ddd<\/li>\n<li>\u5b89\u5168\u4f18\u5148&#xff1a;\u771f\u5b9e\u673a\u5668\u4eba\u90e8\u7f72\u524d\u5fc5\u987b\u8fdb\u884c\u5145\u5206\u7684\u5b89\u5168\u8fb9\u754c\u6d4b\u8bd5<\/li>\n<\/ul>\n<h4>7.3 \u672a\u6765\u8d8b\u52bf<\/h4>\n<ul>\n<li>\u4e16\u754c\u6a21\u578b&#xff08;World Models&#xff09;&#xff1a;\u5b66\u4e60\u73af\u5883\u7684\u52a8\u6001\u9884\u6d4b\u6a21\u578b&#xff0c;\u63d0\u5347\u89c4\u5212\u80fd\u529b<\/li>\n<li>Transformer\u4e0eRL\u878d\u5408&#xff1a;Decision Transformer\u7b49\u5e8f\u5217\u5efa\u6a21\u65b9\u6cd5\u91cd\u65b0\u5b9a\u4e49RL\u8303\u5f0f<\/li>\n<li>\u5177\u8eab\u667a\u80fd\u7206\u53d1&#xff1a;\u591a\u6a21\u6001\u5927\u6a21\u578b&#043;\u673a\u5668\u4eba\u786c\u4ef6\u7684\u534f\u540c\u8fdb\u5316<\/li>\n<li>\u81ea\u52a8\u5316RL&#xff08;AutoRL&#xff09;&#xff1a;\u8d85\u53c2\u6570\u4f18\u5316\u3001\u7f51\u7edc\u67b6\u6784\u641c\u7d22\u3001\u5956\u52b1\u5b66\u4e60\u81ea\u52a8\u5316<\/li>\n<\/ul>\n<p>\u672c\u6587\u4ec5\u4f9b\u4ea4\u6d41\u5b66\u4e60\u53c2\u8003&#xff0c;\u8bf7\u52ff\u7528\u4e8e\u5546\u4e1a\u7528\u9014\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357&#xff1a;\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218<br \/>\n\u6458\u8981&#xff1a;\u672c\u6587\u7cfb\u7edf\u68b3\u7406\u5f3a\u5316\u5b66\u4e60&#xff08;Reinforcement Learning, 
RL&#xff09;\u7684\u7406\u8bba\u6846\u67b6\u4e0e\u6280\u672f\u4f53\u7cfb&#xff0c;\u6df1\u5165\u5256\u6790\u5176\u4e0e\u76d1\u7763\u5b66\u4e60\u3001\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u672c\u8d28\u5dee\u5f02&#xff0c;\u91cd\u70b9\u8bb2\u89e3\u57fa\u4e8e\u4eba\u7c7b\u53cd\u9988\u7684\u5f3a\u5316\u5b66\u4e60&#xff08;RLHF&#xff09;\u5728GPT\u7b49\u5927\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u5e94\u7528&#xff0c;\u5e76\u901a\u8fc7\u5177\u8eab\u667a\u80fd&#xff08;Embodied AI&#xff09;\u4e0e\u673a\u5668\u4eba\u63a7\u5236\u5b9e\u6218\u6848<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[384],"topic":[],"class_list":["post-75296","post","type-post","status-publish","format-standard","hentry","category-server","tag-384"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/75296.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357&#xff1a;\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 \u6458\u8981&#xff1a;\u672c\u6587\u7cfb\u7edf\u68b3\u7406\u5f3a\u5316\u5b66\u4e60&#xff08;Reinforcement 
Learning, RL&#xff09;\u7684\u7406\u8bba\u6846\u67b6\u4e0e\u6280\u672f\u4f53\u7cfb&#xff0c;\u6df1\u5165\u5256\u6790\u5176\u4e0e\u76d1\u7763\u5b66\u4e60\u3001\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u672c\u8d28\u5dee\u5f02&#xff0c;\u91cd\u70b9\u8bb2\u89e3\u57fa\u4e8e\u4eba\u7c7b\u53cd\u9988\u7684\u5f3a\u5316\u5b66\u4e60&#xff08;RLHF&#xff09;\u5728GPT\u7b49\u5927\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u5e94\u7528&#xff0c;\u5e76\u901a\u8fc7\u5177\u8eab\u667a\u80fd&#xff08;Embodied AI&#xff09;\u4e0e\u673a\u5668\u4eba\u63a7\u5236\u5b9e\u6218\u6848\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/75296.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-02-11T15:57:43+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"8 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/75296.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/75296.html\",\"name\":\"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-02-11T15:57:43+00:00\",\"dateModified\":\"2026-02-11T15:57:43+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/75296.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/75296.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/75296.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required 
name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/75296.html","og_locale":"zh_CN","og_type":"article","og_title":"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357&#xff1a;\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 \u6458\u8981&#xff1a;\u672c\u6587\u7cfb\u7edf\u68b3\u7406\u5f3a\u5316\u5b66\u4e60&#xff08;Reinforcement Learning, 
RL&#xff09;\u7684\u7406\u8bba\u6846\u67b6\u4e0e\u6280\u672f\u4f53\u7cfb&#xff0c;\u6df1\u5165\u5256\u6790\u5176\u4e0e\u76d1\u7763\u5b66\u4e60\u3001\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u672c\u8d28\u5dee\u5f02&#xff0c;\u91cd\u70b9\u8bb2\u89e3\u57fa\u4e8e\u4eba\u7c7b\u53cd\u9988\u7684\u5f3a\u5316\u5b66\u4e60&#xff08;RLHF&#xff09;\u5728GPT\u7b49\u5927\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u5e94\u7528&#xff0c;\u5e76\u901a\u8fc7\u5177\u8eab\u667a\u80fd&#xff08;Embodied AI&#xff09;\u4e0e\u673a\u5668\u4eba\u63a7\u5236\u5b9e\u6218\u6848","og_url":"https:\/\/www.wsisp.com\/helps\/75296.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-02-11T15:57:43+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"8 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/75296.html","url":"https:\/\/www.wsisp.com\/helps\/75296.html","name":"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-02-11T15:57:43+00:00","dateModified":"2026-02-11T15:57:43+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/75296.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/75296.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/75296.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u6df1\u5ea6\u5f3a\u5316\u5b66\u4e60\u5168\u6808\u6307\u5357\uff1a\u4ece\u7406\u8bba\u539f\u7406\u5230\u673a\u5668\u4eba\u5b9e\u6218"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/75296","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=75296"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/75296\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=75296"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=75296"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=75296"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=75296"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}