{"id":69072,"date":"2026-01-31T03:57:37","date_gmt":"2026-01-30T19:57:37","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/69072.html"},"modified":"2026-01-31T03:57:37","modified_gmt":"2026-01-30T19:57:37","slug":"gemini-%e7%9a%84%e5%a4%9a%e6%a8%a1%e6%80%81%e6%9e%b6%e6%9e%84%e8%ae%be%e8%ae%a1%e5%a6%82%e4%bd%95%e7%bb%9f%e4%b8%80%e6%96%87%e6%9c%ac%e3%80%81%e5%9b%be%e5%83%8f%e3%80%81%e8%a7%86%e9%a2%91%e7%9a%84","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/69072.html","title":{"rendered":"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f"},"content":{"rendered":"<h2>Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a&#xff1f;<\/h2>\n<h3>\u6587\u6863\u6982\u8ff0<\/h3>\n<h4>\u672c\u6587\u6838\u5fc3\u4ef7\u503c<\/h4>\n<li>\u6df1\u5ea6\u62c6\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u8bbe\u8ba1\u903b\u8f91&#xff0c;\u89e3\u91ca\u5176\u5982\u4f55\u5b9e\u73b0\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u7edf\u4e00\u8868\u793a<\/li>\n<li>\u638c\u63e1 Gemini \u591a\u6a21\u6001\u6a21\u578b\u7684\u672c\u5730\u90e8\u7f72\u4e0e\u73af\u5883\u914d\u7f6e\u65b9\u6cd5<\/li>\n<li>\u901a\u8fc7\u53ef\u8fd0\u884c\u7684\u4ee3\u7801\u5b9e\u6218&#xff0c;\u5b9e\u73b0\u6587\u672c \/ \u56fe\u50cf \/ \u89c6\u9891\u7684\u7edf\u4e00\u5411\u91cf\u8868\u793a\u4e0e\u8de8\u6a21\u6001\u68c0\u7d22<\/li>\n<li>\u7406\u89e3\u591a\u6a21\u6001\u8868\u793a\u7edf\u4e00\u7684\u8c03\u8bd5\u3001\u4f18\u5316\u65b9\u6cd5\u4e0e\u751f\u4ea7\u7ea7\u90e8\u7f72\u6280\u5de7<\/li>\n<h4>\u5b66\u4e60\u76ee\u6807<\/h4>\n<li>\u7406\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u7ec4\u4ef6\u4e0e\u8de8\u6a21\u6001\u5bf9\u9f50\u539f\u7406<\/li>\n<li>\u638c\u63e1 Gemini 
\u6a21\u578b\u672c\u5730\u90e8\u7f72\u7684\u73af\u5883\u914d\u7f6e\u4e0e\u4f9d\u8d56\u5b89\u88c5<\/li>\n<li>\u7cbe\u901a\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u6a21\u6001\u8f6c\u6362\u4e0e\u7edf\u4e00\u8868\u793a\u65b9\u6cd5<\/li>\n<li>\u80fd\u591f\u72ec\u7acb\u5b9e\u73b0\u591a\u6a21\u6001\u6570\u636e\u7684\u5411\u91cf\u7f16\u7801\u4e0e\u8de8\u6a21\u6001\u68c0\u7d22\u7cfb\u7edf<\/li>\n<li>\u5b66\u4f1a\u591a\u6a21\u6001\u6a21\u578b\u7684\u6027\u80fd\u4f18\u5316\u4e0e\u90e8\u7f72\u6700\u4f73\u5b9e\u8df5<\/li>\n<h3>\u4e00\u3001Gemini \u591a\u6a21\u6001\u6280\u672f\u6982\u8ff0<\/h3>\n<h4>1.1 \u4ec0\u4e48\u662f Gemini \u591a\u6a21\u6001\u6a21\u578b&#xff1f;<\/h4>\n<p>Gemini \u662f Google DeepMind \u63a8\u51fa\u7684\u65b0\u4e00\u4ee3\u591a\u6a21\u6001\u5927\u6a21\u578b&#xff0c;\u6838\u5fc3\u5b9a\u4f4d\u662f\u539f\u751f\u652f\u6301\u6587\u672c\u3001\u56fe\u50cf\u3001\u97f3\u9891\u3001\u89c6\u9891\u7b49\u591a\u6a21\u6001\u8f93\u5165&#xff0c;\u5e76\u5728\u7edf\u4e00\u7684\u5f20\u91cf\u7a7a\u95f4\u4e2d\u5b8c\u6210\u8868\u793a\u4e0e\u63a8\u7406\u3002\u4e0e\u4f20\u7edf \u201c\u5355\u6a21\u6001\u6a21\u578b &#043; \u6a21\u6001\u8f6c\u6362\u6a21\u5757\u201d \u7684\u62fc\u63a5\u5f0f\u67b6\u6784\u4e0d\u540c&#xff0c;Gemini \u4ece\u5e95\u5c42\u8bbe\u8ba1\u4e0a\u5b9e\u73b0\u4e86\u591a\u6a21\u6001\u7684\u6df1\u5ea6\u878d\u5408&#xff0c;\u771f\u6b63\u505a\u5230 \u201c\u4e00\u4e2a\u6a21\u578b\u5904\u7406\u6240\u6709\u6a21\u6001\u201d\u3002<\/p>\n<h4>1.2 \u4f20\u7edf\u591a\u6a21\u6001\u65b9\u6848\u7684\u5c40\u9650\u6027&#xff08;Gemini \u89e3\u51b3\u7684\u6838\u5fc3\u75db\u70b9&#xff09;<\/h4>\n<table>\n<tr>\u75db\u70b9\u7ef4\u5ea6\u4f20\u7edf\u65b9\u6848\u95ee\u9898Gemini \u89e3\u51b3\u65b9\u6848<\/tr>\n<tbody>\n<tr>\n<td align=\"center\">\u6a21\u6001\u5272\u88c2<\/td>\n<td align=\"center\">\u6587\u672c \/ \u56fe\u50cf \/ \u89c6\u9891\u5404\u81ea\u4f7f\u7528\u72ec\u7acb\u6a21\u578b\u7f16\u7801&#xff0c;\u4ec5\u5728\u8f93\u51fa\u5c42\u878d\u5408<\/td>\n<td 
align=\"center\">\u5171\u4eab\u7edf\u4e00\u7684\u7f16\u7801\u5668\u4e0e\u5f20\u91cf\u7a7a\u95f4&#xff0c;\u6240\u6709\u6a21\u6001\u4ece\u8f93\u5165\u5373\u5bf9\u9f50<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u65f6\u5e8f\u7f3a\u5931<\/td>\n<td align=\"center\">\u89c6\u9891\u5904\u7406\u7b80\u5316\u4e3a \u201c\u56fe\u50cf\u5e27\u5806\u53e0\u201d&#xff0c;\u4e22\u5931\u65f6\u5e8f\u4fe1\u606f<\/td>\n<td align=\"center\">\u5f15\u5165\u65f6\u7a7a\u6ce8\u610f\u529b\u673a\u5236&#xff0c;\u5efa\u6a21\u89c6\u9891\u7684\u5e27\u95f4\u65f6\u5e8f\u4f9d\u8d56<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u8868\u793a\u4e0d\u4e00\u81f4<\/td>\n<td align=\"center\">\u4e0d\u540c\u6a21\u6001\u7684\u5411\u91cf\u7a7a\u95f4\u65e0\u7edf\u4e00\u5ea6\u91cf\u6807\u51c6&#xff0c;\u8de8\u6a21\u6001\u68c0\u7d22\u7cbe\u5ea6\u4f4e<\/td>\n<td align=\"center\">\u7edf\u4e00\u7684\u5f52\u4e00\u5316\u8868\u793a\u7a7a\u95f4&#xff0c;\u652f\u6301\u8de8\u6a21\u6001\u76f8\u4f3c\u5ea6\u8ba1\u7b97<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u8ba1\u7b97\u6548\u7387\u4f4e<\/td>\n<td align=\"center\">\u591a\u6a21\u6001\u5904\u7406\u9700\u591a\u6b21\u6a21\u578b\u8c03\u7528&#xff0c;\u5ef6\u8fdf\u9ad8<\/td>\n<td align=\"center\">\u7aef\u5230\u7aef\u7684\u591a\u6a21\u6001\u7f16\u7801&#xff0c;\u5355\u6b21\u524d\u5411\u4f20\u64ad\u5b8c\u6210\u591a\u6a21\u6001\u8868\u793a<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u4e0a\u4e0b\u6587\u5272\u88c2<\/td>\n<td align=\"center\">\u957f\u89c6\u9891 \/ \u957f\u6587\u672c\u5904\u7406\u65f6\u4e0a\u4e0b\u6587\u4fe1\u606f\u4e22\u5931<\/td>\n<td align=\"center\">\u652f\u6301\u8d85\u957f\u4e0a\u4e0b\u6587\u7a97\u53e3&#xff08;Gemini 1.5 Pro \u652f\u6301 1M tokens&#xff09;<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>1.3 Gemini \u591a\u6a21\u6001\u8868\u793a\u7edf\u4e00\u7684\u6838\u5fc3\u8bbe\u8ba1\u539f\u5219<\/h4>\n<p>Gemini 
\u7684\u6838\u5fc3\u7a81\u7834\u5728\u4e8e\u6452\u5f03\u4e86\u4f20\u7edf\u591a\u6a21\u6001\u6a21\u578b\u201c\u62fc\u63a5\u5f0f\u201d\u7684\u5f02\u6784\u8bbe\u8ba1&#xff0c;\u8f6c\u800c\u91c7\u7528\u539f\u751f&#xff08;Native&#xff09;\u591a\u6a21\u6001\u67b6\u6784\u3002\u8fd9\u610f\u5473\u7740\u6a21\u578b\u4e0d\u518d\u9700\u8981\u9488\u5bf9\u4e0d\u540c\u6a21\u6001\u8bad\u7ec3\u72ec\u7acb\u7684\u7f16\u7801\u5668&#xff08;\u5982 CLIP \u4e2d\u7684 Image Encoder \u548c Text Encoder&#xff09;&#xff0c;\u800c\u662f\u901a\u8fc7\u4ee5\u4e0b\u4e94\u5927\u539f\u5219&#xff0c;\u5728\u5e95\u5c42\u7269\u7406\u7a7a\u95f4\u548c\u9ad8\u5c42\u8bed\u4e49\u7a7a\u95f4\u4e0a\u5b9e\u73b0\u4e86\u5f7b\u5e95\u7684\u7edf\u4e00\u3002<\/p>\n<h5>1.3.1 \u539f\u7406\u67b6\u6784\u6811\u5f62\u56fe (Conceptual Tree)<\/h5>\n<p>Gemini_Unified_Principles\/  <span class=\"token comment\"># \u6838\u5fc3\u8bbe\u8ba1\u539f\u5219\u5168\u666f<\/span><br \/>\n\u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token number\">1<\/span>. Universal_Input <span class=\"token punctuation\">(<\/span>\u4e07\u7269\u7686\u5e8f\u5217&#xff1a;\u7269\u7406\u5c42\u7684\u7edf\u4e00<span class=\"token punctuation\">)<\/span><br \/>\n\u2502   \u251c\u2500\u2500 Text_Stream   \u2500\u2500<span class=\"token operator\">&gt;<\/span> <span class=\"token punctuation\">[<\/span>Tokenization<span class=\"token punctuation\">]<\/span> \u2500\u2500<span class=\"token operator\">&gt;<\/span> 1D ID\u5e8f\u5217<br \/>\n\u2502   \u251c\u2500\u2500 Image_Stream  \u2500\u2500<span class=\"token operator\">&gt;<\/span> <span class=\"token punctuation\">[<\/span>Patch Partition<span class=\"token punctuation\">]<\/span> \u2500\u2500<span class=\"token operator\">&gt;<\/span> \u7ebf\u6027\u6295\u5f71 \u2500\u2500<span class=\"token operator\">&gt;<\/span> \u4f2a1D\u5e8f\u5217<br \/>\n\u2502   \u2514\u2500\u2500 Video_Stream  \u2500\u2500<span class=\"token operator\">&gt;<\/span> <span class=\"token punctuation\">[<\/span>3D Tubelet<span class=\"token 
punctuation\">]<\/span> \u2500\u2500<span class=\"token operator\">&gt;<\/span> \u65f6\u7a7a\u538b\u5e73 \u2500\u2500<span class=\"token operator\">&gt;<\/span> \u5e26\u6709\u65f6\u95f4\u6233\u7684\u5e8f\u5217<br \/>\n\u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token number\">2<\/span>. Shared_Backbone <span class=\"token punctuation\">(<\/span>\u5171\u4eab\u5927\u8111&#xff1a;\u8ba1\u7b97\u5c42\u7684\u7edf\u4e00<span class=\"token punctuation\">)<\/span><br \/>\n\u2502   \u251c\u2500\u2500 Single_Transformer  <span class=\"token comment\"># \u62d2\u7edd\u53cc\u5854\u7ed3\u6784&#xff0c;\u6240\u6709\u6a21\u6001\u8fdb\u5165\u540c\u4e00\u4e2a\u7f51\u7edc<\/span><br \/>\n\u2502   \u251c\u2500\u2500 Shared_Parameters   <span class=\"token comment\"># \u6587\u672c\u548c\u56fe\u50cf\u5171\u7528\u540c\u4e00\u7ec4\u6743\u91cd\u8fdb\u884c\u63a8\u7406<\/span><br \/>\n\u2502   \u2514\u2500\u2500 Early_Fusion        <span class=\"token comment\"># \u6a21\u6001\u878d\u5408\u53d1\u751f\u5728\u7b2c\u4e00\u5c42&#xff0c;\u800c\u975e\u6700\u540e\u8f93\u51fa\u5c42<\/span><br \/>\n\u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token number\">3<\/span>. 
Attention_Mechanism <span class=\"token punctuation\">(<\/span>\u52a8\u6001\u5bf9\u9f50&#xff1a;\u4ea4\u4e92\u5c42\u7684\u7edf\u4e00<span class=\"token punctuation\">)<\/span><br \/>\n\u2502   \u251c\u2500\u2500 Self_Attention      <span class=\"token comment\"># \u6a21\u6001\u5185\u90e8\u5efa\u6a21 (\u5982&#xff1a;\u7406\u89e3\u53e5\u5b50\u8bed\u6cd5 \/ \u56fe\u50cf\u7eb9\u7406)<\/span><br \/>\n\u2502   \u251c\u2500\u2500 Cross_Attention     <span class=\"token comment\"># \u8de8\u6a21\u6001\u6865\u63a5 (\u5982&#xff1a;\u5355\u8bcd&#034;\u732b&#034; &lt;-&gt; \u56fe\u50cf\u4e2d\u732b\u7684\u50cf\u7d20)<\/span><br \/>\n\u2502   \u2514\u2500\u2500 Temporal_Attention  <span class=\"token comment\"># (\u89c6\u9891\u4e13\u7528) \u6cbf\u65f6\u95f4\u8f74\u6355\u6349\u52a8\u4f5c\u53d8\u5316<\/span><br \/>\n\u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token number\">4<\/span>. Unified_Metric <span class=\"token punctuation\">(<\/span>\u5ea6\u91cf\u4e00\u81f4&#xff1a;\u8bed\u4e49\u5c42\u7684\u7edf\u4e00<span class=\"token punctuation\">)<\/span><br \/>\n    \u251c\u2500\u2500 Projection_Layer    <span class=\"token comment\"># \u5f3a\u5236\u7ef4\u5ea6\u5bf9\u9f50 (\u5982\u7edf\u4e00\u6620\u5c04\u5230 2048 \u7ef4)<\/span><br \/>\n    \u2514\u2500\u2500 L2_Normalization    <span class=\"token comment\"># \u5f52\u4e00\u5316\u5230\u8d85\u7403\u9762\u4e0a&#xff0c;\u786e\u4fdd Cosine \u8ddd\u79bb\u53ef\u6bd4<\/span><\/p>\n<h5>1.3.2 \u6838\u5fc3\u539f\u5219\u6df1\u5ea6\u89e3\u6790<\/h5>\n<p>1. 
\u6a21\u6001\u65e0\u5173\u7684\u8f93\u5165\u7f16\u7801 (Modality-Agnostic Input Encoding)<\/p>\n<ul>\n<li>\u539f\u7406&#xff1a;Gemini \u5efa\u7acb\u4e86\u4e00\u4e2a\u201c\u901a\u7528\u8bcd\u6c47\u8868\u201d\u3002\u5b83\u4e0d\u76f4\u63a5\u5904\u7406\u539f\u59cb\u7684\u50cf\u7d20\u6216\u97f3\u9891\u6ce2\u5f62&#xff0c;\u800c\u662f\u5c06\u5b83\u4eec\u89c6\u4e3a\u4e00\u79cd\u201c\u5916\u8bed\u201d&#xff0c;\u7ffb\u8bd1\u6210\u6a21\u578b\u80fd\u7406\u89e3\u7684 Token\u3002<\/li>\n<li>\u6269\u5145\u7ec6\u8282&#xff1a;\n<ul>\n<li>\u6587\u672c&#xff1a;\u4f7f\u7528 SentencePiece \u6a21\u578b\u5c06\u81ea\u7136\u8bed\u8a00\u5207\u5206\u4e3a\u5b50\u8bcd Token\u3002<\/li>\n<li>\u56fe\u50cf&#xff1a;\u91c7\u7528 Patch Embedding \u6280\u672f&#xff0c;\u5c06 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           H<\/p>\n<p>           \u00d7<\/p>\n<p>           W<\/p>\n<p>          H \\\\times W<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7667em;vertical-align: -0.0833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0813em\">H<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><\/span><\/span><\/span><\/span> \u7684\u56fe\u50cf\u5207\u5206\u4e3a <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           16<\/p>\n<p>           \u00d7<\/p>\n<p>           16<\/p>\n<p>          16 \\\\times 16<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">16<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span 
class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">16<\/span><\/span><\/span><\/span><\/span> \u7684\u5c0f\u65b9\u5757&#xff0c;\u901a\u8fc7\u7ebf\u6027\u5c42\u6295\u5f71\u4e3a\u5411\u91cf&#xff0c;\u5e76\u52a0\u4e0a 2D \u4f4d\u7f6e\u7f16\u7801\u3002\u5728\u6a21\u578b\u770b\u6765&#xff0c;\u8fd9\u4e0d\u518d\u662f\u56fe\u7247&#xff0c;\u800c\u662f\u4e00\u53e5\u201c\u7531\u50cf\u7d20\u5757\u7ec4\u6210\u7684\u53e5\u5b50\u201d\u3002<\/li>\n<li>\u89c6\u9891&#xff1a;\u4e0d\u53ea\u662f\u56fe\u50cf\u7684\u5806\u53e0&#xff0c;\u800c\u662f\u91c7\u7528 3D Tubelet \u6216\u65f6\u7a7a\u91c7\u6837&#xff0c;\u5c06\u65f6\u95f4\u7ef4\u5ea6 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           T<\/p>\n<p>          T<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">T<\/span><\/span><\/span><\/span><\/span> \u4e5f\u7eb3\u5165\u5207\u5206\u8303\u56f4&#xff0c;\u5f62\u6210 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           (<\/p>\n<p>           t<\/p>\n<p>           ,<\/p>\n<p>           h<\/p>\n<p>           ,<\/p>\n<p>           w<\/p>\n<p>           )<\/p>\n<p>          (t, h, w)<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">h<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0269em\">w<\/span><span 
class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span> \u7684\u4e09\u7ef4 Token&#xff0c;\u4fdd\u7559\u4e86\u65f6\u5e8f\u6d41\u52a8\u7684\u7269\u7406\u7279\u5f81\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>2. \u5171\u4eab\u7684 Transformer \u9aa8\u5e72\u7f51\u7edc (Shared Transformer Backbone)<\/p>\n<ul>\n<li>\u539f\u7406&#xff1a;\u8fd9\u662f\u201c\u539f\u751f\u591a\u6a21\u6001\u201d\u7684\u6807\u5fd7\u3002\u6240\u6709\u6a21\u6001\u7684\u6570\u636e\u6d41\u7ecf\u540c\u4e00\u4e2a\u795e\u7ecf\u7f51\u7edc&#xff0c;\u4f7f\u7528\u540c\u4e00\u5957\u53c2\u6570&#xff08;\u6743\u91cd\/\u504f\u7f6e&#xff09;\u8fdb\u884c\u8ba1\u7b97\u3002<\/li>\n<li>\u6269\u5145\u7ec6\u8282&#xff1a;\n<ul>\n<li>\u53c2\u6570\u6548\u7387&#xff1a;\u4e0e\u201c\u53cc\u5854\u7ed3\u6784\u201d&#xff08;\u72ec\u7acb\u89c6\u89c9\u5854 &#043; \u72ec\u7acb\u6587\u672c\u5854&#xff09;\u76f8\u6bd4&#xff0c;Gemini \u7684\u53c2\u6570\u5229\u7528\u7387\u6781\u9ad8\u3002\u6a21\u578b\u5b66\u4f1a\u7684\u201c\u903b\u8f91\u63a8\u7406\u80fd\u529b\u201d\u53ef\u4ee5\u540c\u65f6\u4f5c\u7528\u4e8e\u6587\u672c\u5206\u6790\u548c\u56fe\u50cf\u7406\u89e3\u3002<\/li>\n<li>\u65e9\u671f\u878d\u5408 (Early Fusion)&#xff1a;\u6a21\u6001\u95f4\u7684\u4ea4\u4e92\u4ece\u7f51\u7edc\u7684\u7b2c\u4e00\u5c42\u5c31\u5f00\u59cb\u4e86&#xff0c;\u800c\u4e0d\u662f\u50cf\u4f20\u7edf\u6a21\u578b\u90a3\u6837\u7b49\u5230\u6700\u540e\u624d\u8fdb\u884c\u7279\u5f81\u62fc\u63a5\u3002\u8fd9\u4f7f\u5f97\u6a21\u578b\u80fd\u6355\u6349\u5230\u66f4\u6df1\u5c42\u6b21\u7684\u56fe\u6587\u9690\u55bb\u5173\u7cfb\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>3. 
\u8de8\u6a21\u6001\u6ce8\u610f\u529b\u5bf9\u9f50 (Cross-Modal Attention Alignment)<\/p>\n<ul>\n<li>\u539f\u7406&#xff1a;\u5229\u7528 Attention \u673a\u5236\u52a8\u6001\u5efa\u7acb\u4e0d\u540c\u6a21\u6001 Token \u4e4b\u95f4\u7684\u8bed\u4e49\u94fe\u63a5\u3002<\/li>\n<li>\u6269\u5145\u7ec6\u8282&#xff1a;\n<ul>\n<li>\u7ec6\u7c92\u5ea6\u5bf9\u9f50&#xff1a;\u4f20\u7edf\u6a21\u578b\u53ea\u80fd\u5224\u65ad\u201c\u6574\u5f20\u56fe\u201d\u548c\u201c\u6574\u53e5\u8bdd\u201d\u662f\u5426\u5339\u914d\u3002Gemini \u901a\u8fc7\u4ea4\u53c9\u6ce8\u610f\u529b&#xff08;Cross-Attention&#xff09;&#xff0c;\u53ef\u4ee5\u8ba9\u6587\u672c\u5e8f\u5217\u4e2d\u7684 Token&#xff08;Query&#xff09;\u53bb\u67e5\u8be2\u89c6\u89c9\u5e8f\u5217\u4e2d\u7684 Patch&#xff08;Key\/Value&#xff09;\u3002\u4f8b\u5982&#xff0c;\u5f53\u5904\u7406\u6587\u672c\u201c\u6234\u7ea2\u5e3d\u5b50\u7684\u7537\u5b69\u201d\u65f6&#xff0c;\u6a21\u578b\u7684\u6ce8\u610f\u529b\u5934\u4f1a\u9ad8\u4eae\u56fe\u50cf\u4e2d\u201c\u7ea2\u8272\u201d\u548c\u201c\u5e3d\u5b50\u201d\u5bf9\u5e94\u7684 Patch \u533a\u57df\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>4. 
\u65f6\u7a7a\u8054\u5408\u5efa\u6a21 (Joint Spatiotemporal Modeling)<\/p>\n<ul>\n<li>\u539f\u7406&#xff1a;\u9488\u5bf9\u89c6\u9891\u6a21\u6001&#xff0c;\u6a21\u578b\u5fc5\u987b\u7406\u89e3\u201c\u53d8\u5316\u201d\u3002<\/li>\n<li>\u6269\u5145\u7ec6\u8282&#xff1a;\n<ul>\n<li>3D \u4f4d\u7f6e\u7f16\u7801&#xff1a;\u9664\u4e86 x, y \u7a7a\u95f4\u5750\u6807&#xff0c;\u6bcf\u4e2a\u89c6\u9891 Token \u8fd8\u88ab\u8d4b\u4e88\u4e86 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           t<\/p>\n<p>          t<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6151em\"><\/span><span class=\"mord mathnormal\">t<\/span><\/span><\/span><\/span><\/span> \u65f6\u95f4\u5750\u6807\u3002<\/li>\n<li>\u56e0\u679c\u6ce8\u610f\u529b&#xff1a;\u5728\u7406\u89e3\u89c6\u9891\u65f6&#xff0c;\u6a21\u578b\u4e0d\u4ec5\u5173\u6ce8\u5f53\u524d\u5e27&#xff0c;\u8fd8\u4f1a\u901a\u8fc7\u65f6\u5e8f\u6ce8\u610f\u529b\u673a\u5236&#xff08;Temporal Attention&#xff09;\u56de\u987e\u5386\u53f2\u5e27&#xff0c;\u4ece\u800c\u7406\u89e3\u52a8\u4f5c\u7684\u8fde\u7eed\u6027&#xff08;\u5982\u5206\u8fa8\u201c\u4eba\u5750\u4e0b\u201d\u548c\u201c\u4eba\u7ad9\u8d77\u201d\u7684\u533a\u522b&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>5. 
\u7edf\u4e00\u7684\u5411\u91cf\u5f52\u4e00\u5316 (Unified Vector Normalization)<\/p>\n<ul>\n<li>\u539f\u7406&#xff1a;\u6d88\u9664\u4e0d\u540c\u6a21\u6001\u5728\u7279\u5f81\u5206\u5e03\u4e0a\u7684\u201c\u91cf\u7eb2\u201d\u5dee\u5f02&#xff0c;\u786e\u4fdd\u5b83\u4eec\u5728\u540c\u4e00\u4e2a\u51e0\u4f55\u7a7a\u95f4\u4e2d\u53ef\u6bd4\u8f83\u3002<\/li>\n<li>\u6269\u5145\u7ec6\u8282&#xff1a;\n<ul>\n<li>\u8d85\u7403\u9762\u6295\u5f71&#xff1a;\u6240\u6709\u8f93\u51fa\u5411\u91cf\u7ecf\u8fc7 LayerNorm \u548c L2 \u5f52\u4e00\u5316\u540e&#xff0c;\u90fd\u88ab\u6295\u5c04\u5230\u540c\u4e00\u4e2a\u5355\u4f4d\u8d85\u7403\u9762\u4e0a\u3002<\/li>\n<li>\u6570\u5b66\u610f\u4e49&#xff1a;\u8fd9\u4fdd\u8bc1\u4e86\u6211\u4eec\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\u4f59\u5f26\u76f8\u4f3c\u5ea6 (Cosine Similarity) \u6765\u8861\u91cf\u4efb\u4f55\u4e24\u4e2a\u5bf9\u8c61\u7684\u8ddd\u79bb\u3002\u5982\u679c\u6587\u672c\u201c\u72d7\u201d\u548c\u56fe\u50cf\u201c\u72d7\u201d\u7684\u5411\u91cf\u5939\u89d2\u4e3a 0 \u5ea6&#xff0c;\u8bf4\u660e\u6a21\u578b\u771f\u6b63\u5b9e\u73b0\u4e86\u8bed\u4e49\u4e0a\u7684\u7edf\u4e00&#xff0c;\u800c\u4e0d\u4ec5\u4ec5\u662f\u6570\u503c\u4e0a\u7684\u63a5\u8fd1\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h3>\u4e8c\u3001Gemini \u672c\u5730\u90e8\u7f72\u4e0e\u73af\u5883\u914d\u7f6e<\/h3>\n<h4>2.1 \u90e8\u7f72\u73af\u5883\u8981\u6c42<\/h4>\n<table>\n<tr>\u73af\u5883\u7c7b\u578b\u6700\u4f4e\u914d\u7f6e\u63a8\u8350\u914d\u7f6e<\/tr>\n<tbody>\n<tr>\n<td align=\"center\">\u64cd\u4f5c\u7cfb\u7edf<\/td>\n<td align=\"center\">Windows 10\/11\u3001macOS 13&#043;\u3001Linux (Ubuntu 20.04&#043;)<\/td>\n<td align=\"center\">Linux (Ubuntu 22.04)<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">CPU<\/td>\n<td align=\"center\">8 \u6838 16 \u7ebf\u7a0b<\/td>\n<td align=\"center\">16 \u6838 32 \u7ebf\u7a0b<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u5185\u5b58<\/td>\n<td align=\"center\">32GB<\/td>\n<td align=\"center\">64GB&#043;<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">GPU<\/td>\n<td align=\"center\">NVIDIA 
RTX 3090 (24GB)<\/td>\n<td align=\"center\">NVIDIA A100 (80GB) \/ RTX 4090 (24GB)<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">CUDA<\/td>\n<td align=\"center\">12.1&#043;<\/td>\n<td align=\"center\">12.2&#043;<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">Python<\/td>\n<td align=\"center\">3.9-3.11<\/td>\n<td align=\"center\">3.10<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>2.2 \u73af\u5883\u914d\u7f6e\u6b65\u9aa4<\/h4>\n<h5>2.2.1 \u521b\u5efa\u72ec\u7acb Conda \u73af\u5883<\/h5>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token comment\"># \u521b\u5efa Gemini \u4e13\u5c5e\u73af\u5883<\/span><br \/>\nconda create <span class=\"token parameter variable\">-n<\/span> gemini-multimodal <span class=\"token assign-left variable\">python<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">3.10<\/span><br \/>\n<span class=\"token comment\"># \u6fc0\u6d3b\u73af\u5883<\/span><br \/>\nconda activate gemini-multimodal<\/p>\n<h5>2.2.2 \u5b89\u88c5\u6838\u5fc3\u4f9d\u8d56\u5e93<\/h5>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token comment\"># \u57fa\u7840\u4f9d\u8d56<\/span><br \/>\npip <span class=\"token function\">install<\/span> <span class=\"token assign-left variable\">numpy<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">1.26<\/span>.4 <span class=\"token assign-left variable\">pandas<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">2.2<\/span>.2 <span class=\"token assign-left variable\">pillow<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">10.3<\/span>.0 opencv-python<span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">4.9<\/span>.0.80<br \/>\n<span class=\"token comment\"># PyTorch (GPU\u7248\u672c&#xff0c;\u5339\u914dCUDA 12.1)<\/span><br \/>\npip <span class=\"token function\">install<\/span> <span class=\"token assign-left variable\">torch<\/span><span class=\"token operator\">&#061;&#061;<\/span><span 
class=\"token number\">2.3<\/span>.1 <span class=\"token assign-left variable\">torchvision<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.18<\/span>.1 <span class=\"token assign-left variable\">torchaudio<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">2.3<\/span>.1 &#045;&#045;index-url https:\/\/download.pytorch.org\/whl\/cu121<br \/>\n<span class=\"token comment\"># Hugging Face \u751f\u6001&#xff08;\u6a21\u578b\u52a0\u8f7d\/\u5904\u7406&#xff09;<\/span><br \/>\npip <span class=\"token function\">install<\/span> <span class=\"token assign-left variable\">transformers<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">4.41<\/span>.2 <span class=\"token assign-left variable\">datasets<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">2.20<\/span>.0 <span class=\"token assign-left variable\">accelerate<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.31<\/span>.0<br \/>\n<span class=\"token comment\"># Google Gemini API\/\u672c\u5730\u90e8\u7f72\u4f9d\u8d56<\/span><br \/>\npip <span class=\"token function\">install<\/span> google-generativeai<span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.7<\/span>.2 sentence-transformers<span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">2.7<\/span>.0<br \/>\n<span class=\"token comment\"># \u89c6\u9891\u5904\u7406\u4f9d\u8d56<\/span><br \/>\npip <span class=\"token function\">install<\/span> <span class=\"token assign-left variable\">decord<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.6<\/span>.0 ffmpeg-python<span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.2<\/span>.0<br \/>\n<span class=\"token comment\"># \u5411\u91cf\u5b58\u50a8\u4e0e\u68c0\u7d22<\/span><br \/>\npip <span class=\"token 
function\">install<\/span> <span class=\"token assign-left variable\">chromadb<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.5<\/span>.17 faiss-gpu<span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">1.7<\/span>.4<br \/>\n<span class=\"token comment\"># \u65e5\u5fd7\u4e0e\u914d\u7f6e<\/span><br \/>\npip <span class=\"token function\">install<\/span> python-dotenv<span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">1.0<\/span>.1 <span class=\"token assign-left variable\">loguru<\/span><span class=\"token operator\">&#061;&#061;<\/span><span class=\"token number\">0.7<\/span>.2<\/p>\n<h5>2.2.3 \u6a21\u578b\u4e0b\u8f7d&#xff08;\u672c\u5730\u90e8\u7f72&#xff09;<\/h5>\n<p>Gemini \u63d0\u4f9b\u4e0d\u540c\u91cf\u7ea7\u7684\u6a21\u578b\u7248\u672c&#xff0c;\u53ef\u6839\u636e\u786c\u4ef6\u6761\u4ef6\u9009\u62e9&#xff1a;<\/p>\n<table>\n<tr>\u6a21\u578b\u7248\u672c\u53c2\u6570\u91cf\u786c\u4ef6\u8981\u6c42\u9002\u7528\u573a\u666f<\/tr>\n<tbody>\n<tr>\n<td align=\"center\">Gemini 1.5 Flash<\/td>\n<td align=\"center\">11B<\/td>\n<td align=\"center\">GPU 16GB&#043;<\/td>\n<td align=\"center\">\u8f7b\u91cf\u7ea7\u591a\u6a21\u6001\u5904\u7406<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">Gemini 1.5 Pro<\/td>\n<td align=\"center\">90B<\/td>\n<td align=\"center\">GPU 40GB&#043;<\/td>\n<td align=\"center\">\u9ad8\u7cbe\u5ea6\u591a\u6a21\u6001\u8868\u793a<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">Gemini Nano<\/td>\n<td align=\"center\">1.8B<\/td>\n<td align=\"center\">CPU 16GB&#043;<\/td>\n<td align=\"center\">\u8fb9\u7f18\u8bbe\u5907\u90e8\u7f72<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u901a\u8fc7 Hugging Face \u4e0b\u8f7d\u5f00\u6e90\u9002\u914d\u7248&#xff1a;<\/p>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">from<\/span> huggingface_hub <span class=\"token keyword\">import<\/span> snapshot_download<\/p>\n<p><span class=\"token comment\"># \u4e0b\u8f7d Gemini 1.5 Flash 
\u591a\u6a21\u6001\u6a21\u578b&#xff08;\u5f00\u6e90\u9002\u914d\u7248&#xff09;<\/span><br \/>\nmodel_dir <span class=\"token operator\">&#061;<\/span> snapshot_download<span class=\"token punctuation\">(<\/span><br \/>\n    repo_id<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;google\/gemma-2-9b-it&#034;<\/span><span class=\"token punctuation\">,<\/span>  <span class=\"token comment\"># Gemma 2&#xff08;Google \u5f00\u6e90\u6743\u91cd\u6a21\u578b&#xff0c;\u975e Gemini \u672c\u4f53&#xff0c;\u4ec5\u652f\u6301\u6587\u672c&#xff09;<\/span><br \/>\n    cache_dir<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;\/data\/models\/gemini&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    ignore_patterns<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;*.bin.index.json&#034;<\/span><span class=\"token punctuation\">]<\/span>  <span class=\"token comment\"># \u8df3\u8fc7\u4e0d\u5fc5\u8981\u6587\u4ef6<\/span><br \/>\n<span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6a21\u578b\u4e0b\u8f7d\u5b8c\u6210&#xff0c;\u8def\u5f84&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>model_dir<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h3>\u4e09\u3001Gemini \u591a\u6a21\u6001\u8868\u793a\u7edf\u4e00\u7684\u6838\u5fc3\u539f\u7406<\/h3>\n<h4>3.1 \u8f93\u5165\u5c42&#xff1a;\u6a21\u6001\u7684\u7edf\u4e00\u5e8f\u5217\u5316<\/h4>\n<p>Gemini \u9996\u5148\u5c06\u4e0d\u540c\u6a21\u6001\u8f6c\u5316\u4e3a\u7edf\u4e00\u7684\u5e8f\u5217 Token&#xff0c;\u4e3a\u540e\u7eed\u7684\u5171\u4eab\u7f16\u7801\u5960\u5b9a\u57fa\u7840&#xff1a;<\/p>\n<table>\n<tr>\u6a21\u6001\u7c7b\u578b\u5904\u7406\u65b9\u5f0fToken 
\u751f\u6210\u89c4\u5219<\/tr>\n<tbody>\n<tr>\n<td align=\"center\">\u6587\u672c<\/td>\n<td align=\"center\">\u5b57\u8282\u5bf9\u7f16\u7801&#xff08;BPE&#xff09;<\/td>\n<td align=\"center\">\u6309\u8bed\u4e49\u5207\u5206\u4e3a\u5b50\u8bcd&#xff0c;\u751f\u6210 1D Token \u5e8f\u5217<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u56fe\u50cf<\/td>\n<td align=\"center\">\u89c6\u89c9\u5206\u5757&#xff08;Patch&#xff09;<\/td>\n<td align=\"center\">\u5c06\u56fe\u50cf\u5207\u5206\u4e3a 16\u00d716 \u50cf\u7d20\u5757&#xff0c;\u751f\u6210 2D Token \u5e8f\u5217<\/td>\n<\/tr>\n<tr>\n<td align=\"center\">\u89c6\u9891<\/td>\n<td align=\"center\">\u65f6\u7a7a\u5206\u5757<\/td>\n<td align=\"center\">\u5148\u6309\u65f6\u95f4\u7ef4\u5ea6\u62c6\u5e27&#xff08;\u5982 1fps&#xff09;&#xff0c;\u518d\u5bf9\u6bcf\u5e27\u5207\u5206\u4e3a 16\u00d716 Patch&#xff0c;\u751f\u6210 3D Token \u5e8f\u5217&#xff08;\u65f6\u95f4 &#043; \u7a7a\u95f4&#xff09;<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u5728\u8fdb\u5165\u795e\u7ecf\u7f51\u7edc\u4e4b\u524d&#xff0c;Gemini \u5fc5\u987b\u6d88\u9664\u4e0d\u540c\u6570\u636e\u5728\u7269\u7406\u5f62\u6001\u4e0a\u7684\u5dee\u5f02&#xff08;\u50cf\u7d20\u77e9\u9635 vs \u5b57\u7b26\u7f16\u7801&#xff09;\u3002\u5b83\u901a\u8fc7 \u901a\u7528\u5e8f\u5217\u5316 (Universal Serialization) \u5c06\u6240\u6709\u6a21\u6001\u8f6c\u5316\u4e3a\u7edf\u4e00\u7684 Token \u5e8f\u5217\u3002<\/p>\n<ul>\n<li>\u6587\u672c (1D \u5e8f\u5217)&#xff1a;\n<ul>\n<li>\u673a\u5236&#xff1a;\u4f7f\u7528 SentencePiece\/BPE \u5206\u8bcd\u3002<\/li>\n<li>\u8f6c\u5316&#xff1a;&#034;\u4e00\u53ea\u732b&#034; <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           \u2192<\/p>\n<p>          \\\\rightarrow<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.3669em\"><\/span><span class=\"mrel\">\u2192<\/span><\/span><\/span><\/span><\/span> [ID: 882, ID: 331]\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u56fe\u50cf (2D \u2192 1D 
\u5e8f\u5217)&#xff1a;\n<ul>\n<li>\u673a\u5236&#xff1a;Patch Partition\u3002\u5c06\u9ad8\u6e05\u56fe\u50cf\u5207\u5206\u4e3a\u56fa\u5b9a\u5927\u5c0f&#xff08;\u5982 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           16<\/p>\n<p>           \u00d7<\/p>\n<p>           16<\/p>\n<p>          16 \\\\times 16<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">16<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">16<\/span><\/span><\/span><\/span><\/span>&#xff09;\u7684\u5c0f\u65b9\u5757 (Patch)\u3002<\/li>\n<li>\u8f6c\u5316&#xff1a;\u6bcf\u4e2a Patch \u88ab\u7ebf\u6027\u6295\u5f71 (Linear Projection) \u5c55\u5e73\u4e3a\u4e00\u4e2a\u5411\u91cf&#xff0c;\u8fd9\u5c31\u76f8\u5f53\u4e8e\u56fe\u50cf\u7684\u4e00\u4e2a\u201c\u5355\u8bcd\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u89c6\u9891 (3D \u2192 1D \u5e8f\u5217)&#xff1a;\n<ul>\n<li>\u673a\u5236&#xff1a;Spatiotemporal Patching (\u65f6\u7a7a\u5206\u5757)\u3002\u4e0d\u4ec5\u5728\u7a7a\u95f4\u4e0a\u5207\u5206 Patch&#xff0c;\u8fd8\u5728\u65f6\u95f4\u8f74\u4e0a\u91c7\u6837\u3002<\/li>\n<li>\u8f6c\u5316&#xff1a;\u89c6\u9891\u88ab\u89c6\u4e3a\u4e00\u7cfb\u5217\u968f\u65f6\u95f4\u53d8\u5316\u7684\u56fe\u50cf Patch&#xff0c;\u6700\u7ec8\u4e5f\u88ab\u62c9\u5e73\u4e3a\u4e00\u6761\u957f\u957f\u7684 Token \u5e8f\u5217\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>\u7edf\u4e00\u7ed3\u679c&#xff1a;\u65e0\u8bba\u8f93\u5165\u662f\u6587\u5b57\u3001JPG \u8fd8\u662f MP4&#xff0c;\u5728\u6a21\u578b\u5165\u53e3\u5904&#xff0c;\u5b83\u4eec\u90fd\u53d8\u6210\u4e86\u5f62\u72b6\u76f8\u540c\u7684\u5f20\u91cf [Batch_Size, Sequence_Length, 
Dimension]\u3002<\/p>\n<p>\u793a\u4f8b&#xff1a;\u89c6\u9891\u5e8f\u5217\u5316\u8fc7\u7a0b<\/p>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> cv2<br \/>\n<span class=\"token keyword\">import<\/span> numpy <span class=\"token keyword\">as<\/span> np<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">video_to_patches<\/span><span class=\"token punctuation\">(<\/span>video_path<span class=\"token punctuation\">,<\/span> frame_rate<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> patch_size<span class=\"token operator\">&#061;<\/span><span class=\"token number\">16<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u5c06\u89c6\u9891\u8f6c\u5316\u4e3a\u65f6\u7a7aPatch\u5e8f\u5217&#034;&#034;&#034;<\/span><br \/>\n    <span class=\"token comment\"># 1. 
\u8bfb\u53d6\u89c6\u9891\u5e76\u6309\u5e27\u7387\u62bd\u5e27<\/span><br \/>\n    cap <span class=\"token operator\">&#061;<\/span> cv2<span class=\"token punctuation\">.<\/span>VideoCapture<span class=\"token punctuation\">(<\/span>video_path<span class=\"token punctuation\">)<\/span><br \/>\n    frames <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">]<\/span><br \/>\n    frame_idx <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0<\/span><br \/>\n    <span class=\"token keyword\">while<\/span> cap<span class=\"token punctuation\">.<\/span>isOpened<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        ret<span class=\"token punctuation\">,<\/span> frame <span class=\"token operator\">&#061;<\/span> cap<span class=\"token punctuation\">.<\/span>read<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> <span class=\"token keyword\">not<\/span> ret<span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token keyword\">break<\/span><br \/>\n        <span class=\"token comment\"># \u6309\u5e27\u7387\u91c7\u6837<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> frame_idx <span class=\"token operator\">%<\/span> <span class=\"token builtin\">int<\/span><span class=\"token punctuation\">(<\/span>cap<span class=\"token punctuation\">.<\/span>get<span class=\"token punctuation\">(<\/span>cv2<span class=\"token punctuation\">.<\/span>CAP_PROP_FPS<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">\/<\/span> frame_rate<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">0<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token 
comment\"># \u8f6c\u4e3aRGB\u5e76Resize\u5230\u56fa\u5b9a\u5c3a\u5bf8<\/span><br \/>\n            frame <span class=\"token operator\">&#061;<\/span> cv2<span class=\"token punctuation\">.<\/span>cvtColor<span class=\"token punctuation\">(<\/span>frame<span class=\"token punctuation\">,<\/span> cv2<span class=\"token punctuation\">.<\/span>COLOR_BGR2RGB<span class=\"token punctuation\">)<\/span><br \/>\n            frame <span class=\"token operator\">&#061;<\/span> cv2<span class=\"token punctuation\">.<\/span>resize<span class=\"token punctuation\">(<\/span>frame<span class=\"token punctuation\">,<\/span> <span class=\"token punctuation\">(<\/span><span class=\"token number\">256<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">256<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># 16\u00d716 Patch \u00d7 16<\/span><br \/>\n            frames<span class=\"token punctuation\">.<\/span>append<span class=\"token punctuation\">(<\/span>frame<span class=\"token punctuation\">)<\/span><br \/>\n        frame_idx <span class=\"token operator\">&#043;&#061;<\/span> <span class=\"token number\">1<\/span><br \/>\n    cap<span class=\"token punctuation\">.<\/span>release<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 2. 
\u751f\u6210\u65f6\u7a7aPatch<\/span><br \/>\n    patches <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">]<\/span><br \/>\n    <span class=\"token keyword\">for<\/span> t<span class=\"token punctuation\">,<\/span> frame <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">enumerate<\/span><span class=\"token punctuation\">(<\/span>frames<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># \u7a7a\u95f4\u7ef4\u5ea6\u5207\u5206Patch<\/span><br \/>\n        <span class=\"token keyword\">for<\/span> y <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> frame<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span> patch_size<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token keyword\">for<\/span> x <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> frame<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span> patch_size<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                patch <span class=\"token operator\">&#061;<\/span> frame<span class=\"token punctuation\">[<\/span>y<span class=\"token punctuation\">:<\/span>y<span class=\"token 
operator\">&#043;<\/span>patch_size<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">:<\/span>x<span class=\"token operator\">&#043;<\/span>patch_size<span class=\"token punctuation\">]<\/span><br \/>\n                <span class=\"token comment\"># \u8bb0\u5f55\u65f6\u7a7a\u5750\u6807 (t, y, x)<\/span><br \/>\n                patches<span class=\"token punctuation\">.<\/span>append<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">{<\/span><br \/>\n                    <span class=\"token string\">&#034;patch&#034;<\/span><span class=\"token punctuation\">:<\/span> patch<span class=\"token punctuation\">.<\/span>flatten<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                    <span class=\"token string\">&#034;coords&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">(<\/span>t<span class=\"token punctuation\">,<\/span> y<span class=\"token operator\">\/\/<\/span>patch_size<span class=\"token punctuation\">,<\/span> x<span class=\"token operator\">\/\/<\/span>patch_size<span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token punctuation\">}<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 3. 
\u8f6c\u5316\u4e3a\u7edf\u4e00\u5e8f\u5217&#xff08;\u65f6\u95f4\u2192\u7a7a\u95f4\u5c55\u5f00&#xff09;<\/span><br \/>\n    patch_sequence <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>array<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">[<\/span>p<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;patch&#034;<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token keyword\">for<\/span> p <span class=\"token keyword\">in<\/span> patches<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> patch_sequence<span class=\"token punctuation\">,<\/span> frames<\/p>\n<p><span class=\"token comment\"># \u6d4b\u8bd5\u89c6\u9891\u5e8f\u5217\u5316<\/span><br \/>\nvideo_path <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;sample_video.mp4&#034;<\/span><br \/>\npatch_seq<span class=\"token punctuation\">,<\/span> frames <span class=\"token operator\">&#061;<\/span> video_to_patches<span class=\"token punctuation\">(<\/span>video_path<span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u89c6\u9891\u62bd\u5e27\u6570\u91cf&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span><span class=\"token builtin\">len<\/span><span class=\"token punctuation\">(<\/span>frames<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token 
string\">f&#034;\u751f\u6210Patch\u5e8f\u5217\u957f\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span><span class=\"token builtin\">len<\/span><span class=\"token punctuation\">(<\/span>patch_seq<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u5355\u4e2aPatch\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>patch_seq<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>3.2 \u7f16\u7801\u5c42&#xff1a;\u5171\u4eab Transformer \u4e0e\u8de8\u6a21\u6001\u6ce8\u610f\u529b<\/h4>\n<p>Gemini \u7684\u6838\u5fc3\u662f\u5171\u4eab\u7684 Transformer \u7f16\u7801\u5668&#xff0c;\u6240\u6709\u6a21\u6001\u7684 Token \u5e8f\u5217\u8f93\u5165\u540c\u4e00\u5957 Transformer \u7f51\u7edc&#xff0c;\u5e76\u901a\u8fc7\u8de8\u6a21\u6001\u6ce8\u610f\u529b\u673a\u5236\u5b9e\u73b0\u8bed\u4e49\u5bf9\u9f50&#xff1a;<\/p>\n<li>\u57fa\u7840\u6ce8\u610f\u529b\u5c42&#xff1a;\u5bf9\u5355\u4e00\u6a21\u6001 Token \u8fdb\u884c\u81ea\u6ce8\u610f\u529b\u8ba1\u7b97&#xff08;\u5982\u6587\u672c\u5185\u90e8\u7684\u8bed\u4e49\u5173\u8054\u3001\u56fe\u50cf Patch \u7684\u7a7a\u95f4\u5173\u8054&#xff09;<\/li>\n<li>\u4ea4\u53c9\u6ce8\u610f\u529b\u5c42&#xff1a;\u5efa\u6a21\u4e0d\u540c\u6a21\u6001 Token \u95f4\u7684\u5173\u8054&#xff08;\u5982 \u201c\u7ea2\u8272\u6c7d\u8f66\u201d \u6587\u672c Token 
\u4e0e\u56fe\u50cf\u4e2d\u7ea2\u8272\u6c7d\u8f66\u533a\u57df Patch \u7684\u5173\u8054&#xff09;<\/li>\n<li>\u65f6\u5e8f\u6ce8\u610f\u529b\u5c42&#xff08;\u89c6\u9891\u4e13\u7528&#xff09;&#xff1a;\u5bf9\u89c6\u9891\u7684\u65f6\u5e8f Token \u6dfb\u52a0\u65f6\u95f4\u7ef4\u5ea6\u7684\u6ce8\u610f\u529b\u6743\u91cd&#xff0c;\u6355\u6349\u5e27\u95f4\u52a8\u6001<\/li>\n<p>graph TD<br \/>\n    A<span class=\"token punctuation\">[<\/span>\u6587\u672c Token<span class=\"token punctuation\">]<\/span> --<span class=\"token operator\">&gt;<\/span> D<span class=\"token punctuation\">{<\/span>\u5171\u4eab Transformer<span class=\"token punctuation\">}<\/span><br \/>\n    B<span class=\"token punctuation\">[<\/span>\u56fe\u50cf Patch Token<span class=\"token punctuation\">]<\/span> --<span class=\"token operator\">&gt;<\/span> D<br \/>\n    C<span class=\"token punctuation\">[<\/span>\u89c6\u9891\u65f6\u7a7a Token<span class=\"token punctuation\">]<\/span> --<span class=\"token operator\">&gt;<\/span> D<\/p>\n<p>    subgraph <span class=\"token string\">&#034;Gemini \u7edf\u4e00\u7f16\u7801\u5668&#034;<\/span><br \/>\n    D --\u81ea\u6ce8\u610f\u529b\u673a\u5236 <span class=\"token punctuation\">(<\/span>Self-Attention<span class=\"token punctuation\">)<\/span>--<span class=\"token operator\">&gt;<\/span> E<span class=\"token punctuation\">[<\/span>\u4ea4\u4e92\u4e0e\u878d\u5408<span class=\"token punctuation\">]<\/span><br \/>\n    E --\u524d\u9988\u7f51\u7edc <span class=\"token punctuation\">(<\/span>FFN<span class=\"token punctuation\">)<\/span>--<span class=\"token operator\">&gt;<\/span> F<span class=\"token punctuation\">[<\/span>\u8bed\u4e49\u7279\u5f81\u63d0\u53d6<span class=\"token punctuation\">]<\/span><br \/>\n    end<\/p>\n<p>    F --<span class=\"token operator\">&gt;<\/span> G<span class=\"token punctuation\">[<\/span>\u7edf\u4e00\u5411\u91cf\u7a7a\u95f4<span class=\"token 
punctuation\">]<\/span><\/p>\n<p>\u6838\u5fc3\u4ee3\u7801&#xff1a;\u8de8\u6a21\u6001\u6ce8\u610f\u529b\u5b9e\u73b0<\/p>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> torch<br \/>\n<span class=\"token keyword\">import<\/span> torch<span class=\"token punctuation\">.<\/span>nn <span class=\"token keyword\">as<\/span> nn<br \/>\n<span class=\"token keyword\">import<\/span> torch<span class=\"token punctuation\">.<\/span>nn<span class=\"token punctuation\">.<\/span>functional <span class=\"token keyword\">as<\/span> F<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">CrossModalAttention<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> d_model<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2048<\/span><span class=\"token punctuation\">,<\/span> n_heads<span class=\"token operator\">&#061;<\/span><span class=\"token number\">16<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>d_model <span class=\"token operator\">&#061;<\/span> d_model<br \/>\n        self<span class=\"token punctuation\">.<\/span>n_heads <span class=\"token operator\">&#061;<\/span> n_heads<br \/>\n        self<span class=\"token punctuation\">.<\/span>head_dim <span class=\"token operator\">&#061;<\/span> 
d_model <span class=\"token operator\">\/\/<\/span> n_heads<\/p>\n<p>        <span class=\"token comment\"># \u5171\u4eab\u7684\u6ce8\u610f\u529b\u6295\u5f71\u5c42<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>q_proj <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>k_proj <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>v_proj <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>out_proj <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> text_embeds<span class=\"token punctuation\">,<\/span> visual_embeds<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;<br \/>\n        
\u8de8\u6a21\u6001\u6ce8\u610f\u529b\u8ba1\u7b97&#xff1a;\u4ee5\u6587\u672c\u4e3aQuery&#xff0c;\u89c6\u89c9&#xff08;\u56fe\u50cf\/\u89c6\u9891&#xff09;\u4e3aKey\/Value<br \/>\n        Args:<br \/>\n            text_embeds: [batch, text_len, d_model] \u6587\u672c\u5d4c\u5165<br \/>\n            visual_embeds: [batch, visual_len, d_model] \u89c6\u89c9\u5d4c\u5165<br \/>\n        Returns:<br \/>\n            cross_embeds: [batch, text_len &#043; visual_len, d_model] \u878d\u5408\u540e\u7684\u5d4c\u5165<br \/>\n        &#034;&#034;&#034;<\/span><br \/>\n        batch_size <span class=\"token operator\">&#061;<\/span> text_embeds<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><\/p>\n<p>        <span class=\"token comment\"># 1. \u6295\u5f71\u4e3aQuery\/Key\/Value<\/span><br \/>\n        q <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>q_proj<span class=\"token punctuation\">(<\/span>text_embeds<span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u6587\u672c\u4f5c\u4e3aQuery<\/span><br \/>\n        k <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>k_proj<span class=\"token punctuation\">(<\/span>visual_embeds<span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u89c6\u89c9\u4f5c\u4e3aKey<\/span><br \/>\n        v <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>v_proj<span class=\"token punctuation\">(<\/span>visual_embeds<span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u89c6\u89c9\u4f5c\u4e3aValue<\/span><\/p>\n<p>        <span class=\"token comment\"># 2. 
\u5206\u62c6\u6ce8\u610f\u529b\u5934<\/span><br \/>\n        q <span class=\"token operator\">&#061;<\/span> q<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>n_heads<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>transpose<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        k <span class=\"token operator\">&#061;<\/span> k<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>n_heads<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>transpose<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        v <span class=\"token operator\">&#061;<\/span> v<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token 
punctuation\">.<\/span>n_heads<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>transpose<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 3. \u8ba1\u7b97\u6ce8\u610f\u529b\u5206\u6570&#xff08;\u7f29\u653e\u70b9\u79ef&#xff09;<\/span><br \/>\n        scores <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>matmul<span class=\"token punctuation\">(<\/span>q<span class=\"token punctuation\">,<\/span> k<span class=\"token punctuation\">.<\/span>transpose<span class=\"token punctuation\">(<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token operator\">\/<\/span> torch<span class=\"token punctuation\">.<\/span>sqrt<span class=\"token punctuation\">(<\/span>torch<span class=\"token punctuation\">.<\/span>tensor<span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">,<\/span> dtype<span class=\"token operator\">&#061;<\/span>torch<span class=\"token punctuation\">.<\/span>float32<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        attn_weights <span class=\"token operator\">&#061;<\/span> F<span class=\"token punctuation\">.<\/span>softmax<span class=\"token punctuation\">(<\/span>scores<span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token 
operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 4. \u6ce8\u610f\u529b\u52a0\u6743\u6c42\u548c<\/span><br \/>\n        cross_output <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>matmul<span class=\"token punctuation\">(<\/span>attn_weights<span class=\"token punctuation\">,<\/span> v<span class=\"token punctuation\">)<\/span><br \/>\n        cross_output <span class=\"token operator\">&#061;<\/span> cross_output<span class=\"token punctuation\">.<\/span>transpose<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>contiguous<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>d_model<span class=\"token punctuation\">)<\/span><br \/>\n        cross_output <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>out_proj<span class=\"token punctuation\">(<\/span>cross_output<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 5. 
\u62fc\u63a5\u6587\u672c\u4e0e\u89c6\u89c9\u5d4c\u5165&#xff08;\u4fdd\u6301\u5e8f\u5217\u5b8c\u6574\u6027&#xff09;<\/span><br \/>\n        cross_embeds <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>cat<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">[<\/span>text_embeds <span class=\"token operator\">&#043;<\/span> cross_output<span class=\"token punctuation\">,<\/span> visual_embeds<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> cross_embeds<\/p>\n<p><span class=\"token comment\"># \u6d4b\u8bd5\u8de8\u6a21\u6001\u6ce8\u610f\u529b<\/span><br \/>\n<span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token comment\"># \u6a21\u62df\u6587\u672c\u548c\u89c6\u89c9\u5d4c\u5165<\/span><br \/>\n    batch_size <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">2<\/span><br \/>\n    text_len <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">32<\/span><br \/>\n    visual_len <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">256<\/span><br \/>\n    d_model <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">2048<\/span><\/p>\n<p>    text_embeds <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>randn<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> text_len<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n    visual_embeds <span class=\"token operator\">&#061;<\/span> 
torch<span class=\"token punctuation\">.<\/span>randn<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> visual_len<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u521d\u59cb\u5316\u8de8\u6a21\u6001\u6ce8\u610f\u529b\u5c42<\/span><br \/>\n    cross_attn <span class=\"token operator\">&#061;<\/span> CrossModalAttention<span class=\"token punctuation\">(<\/span>d_model<span class=\"token operator\">&#061;<\/span>d_model<span class=\"token punctuation\">,<\/span> n_heads<span class=\"token operator\">&#061;<\/span><span class=\"token number\">16<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    cross_embeds <span class=\"token operator\">&#061;<\/span> cross_attn<span class=\"token punctuation\">(<\/span>text_embeds<span class=\"token punctuation\">,<\/span> visual_embeds<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u878d\u5408\u540e\u5d4c\u5165\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>cross_embeds<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># [2, 32&#043;256&#061;288, 2048]<\/span><\/p>\n<h4>3.3 \u8f93\u51fa\u5c42&#xff1a;\u7edf\u4e00\u7684\u5411\u91cf\u8868\u793a\u7a7a\u95f4<\/h4>\n<p>Gemini \u901a\u8fc7\u4ee5\u4e0b\u65b9\u5f0f\u4fdd\u8bc1\u6240\u6709\u6a21\u6001\u8f93\u51fa\u5411\u91cf\u7684\u7edf\u4e00\u53ef\u6bd4\u8f83\u6027&#xff1a;<\/p>\n<li>\u5411\u91cf\u5f52\u4e00\u5316&#xff1a;\u6240\u6709\u6a21\u6001\u7684\u8f93\u51fa\u5411\u91cf\u7ecf\u8fc7 L2 
\u5f52\u4e00\u5316&#xff0c;\u786e\u4fdd\u5411\u91cf\u957f\u5ea6\u4e00\u81f4<\/li>\n<li>\u7ef4\u5ea6\u5bf9\u9f50&#xff1a;\u5f3a\u5236\u6240\u6709\u6a21\u6001\u7684\u8f93\u51fa\u5411\u91cf\u7ef4\u5ea6\u76f8\u540c&#xff08;\u5982 2048 \u7ef4&#xff09;<\/li>\n<li>\u8bed\u4e49\u6821\u51c6&#xff1a;\u901a\u8fc7\u5927\u89c4\u6a21\u8de8\u6a21\u6001\u5bf9\u6bd4\u5b66\u4e60&#xff0c;\u4f7f\u4e0d\u540c\u6a21\u6001\u7684\u8bed\u4e49\u76f8\u4f3c\u5185\u5bb9\u5728\u5411\u91cf\u7a7a\u95f4\u4e2d\u8ddd\u79bb\u66f4\u8fd1<\/li>\n<p>\u793a\u4f8b&#xff1a;\u591a\u6a21\u6001\u5411\u91cf\u5f52\u4e00\u5316<\/p>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">normalize_embeddings<\/span><span class=\"token punctuation\">(<\/span>embeddings<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;L2\u5f52\u4e00\u5316&#xff0c;\u4fdd\u8bc1\u5411\u91cf\u957f\u5ea6\u4e3a1&#034;&#034;&#034;<\/span><br \/>\n    norm <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>norm<span class=\"token punctuation\">(<\/span>embeddings<span class=\"token punctuation\">,<\/span> p<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> keepdim<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> embeddings <span class=\"token operator\">\/<\/span> <span class=\"token punctuation\">(<\/span>norm <span class=\"token operator\">&#043;<\/span> <span class=\"token number\">1e-8<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token 
comment\"># \u6a21\u62df\u4e0d\u540c\u6a21\u6001\u7684\u539f\u59cb\u5d4c\u5165<\/span><br \/>\ntext_emb <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>randn<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2048<\/span><span class=\"token punctuation\">)<\/span>    <span class=\"token comment\"># \u6587\u672c\u5d4c\u5165<\/span><br \/>\nimage_emb <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>randn<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2048<\/span><span class=\"token punctuation\">)<\/span>   <span class=\"token comment\"># \u56fe\u50cf\u5d4c\u5165<\/span><br \/>\nvideo_emb <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>randn<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2048<\/span><span class=\"token punctuation\">)<\/span>   <span class=\"token comment\"># \u89c6\u9891\u5d4c\u5165<\/span><\/p>\n<p><span class=\"token comment\"># \u5f52\u4e00\u5316<\/span><br \/>\ntext_emb_norm <span class=\"token operator\">&#061;<\/span> normalize_embeddings<span class=\"token punctuation\">(<\/span>text_emb<span class=\"token punctuation\">)<\/span><br \/>\nimage_emb_norm <span class=\"token operator\">&#061;<\/span> normalize_embeddings<span class=\"token punctuation\">(<\/span>image_emb<span class=\"token punctuation\">)<\/span><br \/>\nvideo_emb_norm <span class=\"token operator\">&#061;<\/span> normalize_embeddings<span class=\"token punctuation\">(<\/span>video_emb<span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># \u9a8c\u8bc1\u5f52\u4e00\u5316\u7ed3\u679c<\/span><br \/>\n<span 
class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672c\u5411\u91cf\u957f\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>torch<span class=\"token punctuation\">.<\/span>norm<span class=\"token punctuation\">(<\/span>text_emb_norm<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>item<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5e94\u4e3a 1.0<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u56fe\u50cf\u5411\u91cf\u957f\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>torch<span class=\"token punctuation\">.<\/span>norm<span class=\"token punctuation\">(<\/span>image_emb_norm<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>item<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5e94\u4e3a 1.0<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u89c6\u9891\u5411\u91cf\u957f\u5ea6&#xff1a;<\/span><span class=\"token 
interpolation\"><span class=\"token punctuation\">{<\/span>torch<span class=\"token punctuation\">.<\/span>norm<span class=\"token punctuation\">(<\/span>video_emb_norm<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>item<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5e94\u4e3a 1.0<\/span><\/p>\n<p><span class=\"token comment\"># \u8ba1\u7b97\u8de8\u6a21\u6001\u76f8\u4f3c\u5ea6<\/span><br \/>\ntext_image_sim <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>cosine_similarity<span class=\"token punctuation\">(<\/span>text_emb_norm<span class=\"token punctuation\">,<\/span> image_emb_norm<span class=\"token punctuation\">)<\/span><br \/>\ntext_video_sim <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>cosine_similarity<span class=\"token punctuation\">(<\/span>text_emb_norm<span class=\"token punctuation\">,<\/span> video_emb_norm<span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672c-\u56fe\u50cf\u76f8\u4f3c\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text_image_sim<span class=\"token punctuation\">.<\/span>item<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token 
punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672c-\u89c6\u9891\u76f8\u4f3c\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text_video_sim<span class=\"token punctuation\">.<\/span>item<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h5>3.4 \u603b\u7ed3&#xff1a;\u4f20\u7edf\u67b6\u6784 vs Gemini \u67b6\u6784<\/h5>\n<p>\u4e3a\u4e86\u66f4\u76f4\u89c2\u5730\u7406\u89e3&#xff0c;\u6211\u4eec\u53ef\u4ee5\u5bf9\u6bd4\u4e24\u79cd\u67b6\u6784&#xff1a;<\/p>\n<table>\n<tr><th>\u7279\u6027<\/th><th>\u4f20\u7edf\u591a\u6a21\u6001 (\u5982 CLIP &#043; LLM)<\/th><th>Gemini (\u539f\u751f\u591a\u6a21\u6001)<\/th><\/tr>\n<tbody>\n<tr>\n<td>\u7f16\u7801\u5668<\/td>\n<td>\u5272\u88c2&#xff1a;\u6709\u4e00\u4e2a\u89c6\u89c9\u7f16\u7801\u5668 (ViT) \u548c\u4e00\u4e2a\u6587\u672c\u7f16\u7801\u5668 (BERT\/GPT)<\/td>\n<td>\u7edf\u4e00&#xff1a;\u53ea\u6709\u4e00\u4e2a\u5171\u4eab\u7684 Transformer<\/td>\n<\/tr>\n<tr>\n<td>\u4fe1\u606f\u4ea4\u4e92<\/td>\n<td>\u665a\u671f\u878d\u5408&#xff1a;\u4ec5\u5728\u6700\u540e\u8f93\u51fa\u5c42\u6216\u901a\u8fc7 Adapter \u7b80\u5355\u7684\u62fc\u63a5<\/td>\n<td>\u5168\u7a0b\u878d\u5408&#xff1a;\u4ece\u7b2c\u4e00\u5c42\u5f00\u59cb&#xff0c;\u6587\u672c\u548c\u56fe\u50cf Token \u5c31\u6df7\u5408\u8ba1\u7b97\u6ce8\u610f\u529b<\/td>\n<\/tr>\n<tr>\n<td>\u89c6\u9891\u5904\u7406<\/td>\n<td>\u901a\u5e38\u89c6\u4e3a\u201c\u56fe\u50cf\u7684\u5e73\u5747\u503c\u201d&#xff0c;\u4e22\u5931\u65f6\u5e8f\u4fe1\u606f<\/td>\n<td>\u65f6\u7a7a\u5efa\u6a21&#xff1a;\u5c06\u65f6\u95f4\u4f5c\u4e3a Token 
\u7684\u4e00\u4e2a\u7ef4\u5ea6&#xff0c;\u7406\u89e3\u52a8\u4f5c\u548c\u53d8\u5316<\/td>\n<\/tr>\n<tr>\n<td>\u4f18\u52bf<\/td>\n<td>\u8bad\u7ec3\u6210\u672c\u8f83\u4f4e&#xff0c;\u6a21\u5757\u53ef\u66ff\u6362<\/td>\n<td>\u7406\u89e3\u529b\u66f4\u5f3a&#xff0c;\u80fd\u5904\u7406\u590d\u6742\u7684\u56fe\u6587\u63a8\u7406\u548c\u957f\u89c6\u9891\u7406\u89e3<\/td>\n<\/tr>\n<tr>\n<td>\u4ee3\u7801\u4f53\u73b0<\/td>\n<td>\u9700\u8981\u52a0\u8f7d\u4e24\u4e2a\u6a21\u578b\u6743\u91cd\u6587\u4ef6<\/td>\n<td>\u53ea\u52a0\u8f7d\u4e00\u4e2a\u6a21\u578b\u6743\u91cd (\u5982 google\/gemma-2-9b-it)<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>\u56db\u3001\u5b9e\u6218&#xff1a;Gemini \u7edf\u4e00\u6587\u672c \/ \u56fe\u50cf \/ \u89c6\u9891\u8868\u793a\u7684\u5b8c\u6574\u5b9e\u73b0<\/h3>\n<h4>4.1 \u9879\u76ee\u7ed3\u6784\u8bbe\u8ba1<\/h4>\n<p>gemini-multimodal\/<br \/>\n\u251c\u2500\u2500 .env                  <span class=\"token comment\"># [\u5bc6\u94a5\u91d1\u5e93] \u5b58\u653e API Key\u3001\u672c\u5730\u6a21\u578b\u7edd\u5bf9\u8def\u5f84&#xff0c;\u9694\u79bb\u654f\u611f\u73af\u5883\u914d\u7f6e<\/span><br \/>\n\u251c\u2500\u2500 requirements.txt      <span class=\"token comment\"># [\u4f9d\u8d56\u6e05\u5355] \u5b9a\u4e49 PyTorch\u3001Transformers\u3001ChromaDB \u7b49\u6838\u5fc3\u8ba1\u7b97\u5e93\u7248\u672c<\/span><br \/>\n\u251c\u2500\u2500 config.py             <span class=\"token comment\"># [\u7edf\u4e00\u6807\u51c6] \u5b9a\u4e49\u8de8\u6a21\u6001\u5171\u4eab\u7684\u7ef4\u5ea6(D_MODEL)\u3001\u6700\u5927\u5e8f\u5217\u957f\u5ea6\u53ca\u786c\u4ef6\u53c2\u6570<\/span><br \/>\n\u251c\u2500\u2500 main.py               <span class=\"token comment\"># [\u4e2d\u63a7\u53f0] \u7cfb\u7edf\u5165\u53e3&#xff0c;\u8d1f\u8d23\u8c03\u5ea6\u9884\u5904\u7406\u3001\u7f16\u7801\u4e0e\u68c0\u7d22\u6a21\u5757\u7684\u6d41\u6c34\u7ebf\u4ea4\u4e92<\/span><br \/>\n\u2502<br \/>\n\u251c\u2500\u2500 core\/                 <span class=\"token comment\"># [\u591a\u6a21\u6001\u5f15\u64ce] 
\u6838\u5fc3\u7b97\u6cd5\u5b9e\u73b0\u5e93&#xff08;\u672c\u7cfb\u7edf\u7684\u201c\u5fc3\u810f\u201d&#xff09;<\/span><br \/>\n\u2502   \u251c\u2500\u2500 __init__.py       <span class=\"token comment\"># \u5305\u521d\u59cb\u5316\u6587\u4ef6<\/span><br \/>\n\u2502   \u251c\u2500\u2500 model_setup.py    <span class=\"token comment\"># [\u5171\u4eab\u9aa8\u67b6] \u52a0\u8f7d Gemini \u7edf\u4e00 Transformer \u7f16\u7801\u5668\u4e0e\u8de8\u6a21\u6001\u9002\u914d\u5c42<\/span><br \/>\n\u2502   \u251c\u2500\u2500 modal_processor.py<span class=\"token comment\"># [\u5e8f\u5217\u5316\u5de5\u5382] \u5c06\u6587\u672c\/\u56fe\u50cf\/\u89c6\u9891\u8f6c\u5316\u4e3a\u7edf\u4e00\u683c\u5f0f\u7684 Token \u5e8f\u5217<\/span><br \/>\n\u2502   \u251c\u2500\u2500 embedding.py      <span class=\"token comment\"># [\u7edf\u4e00\u6620\u5c04] \u6267\u884c\u524d\u5411\u4f20\u64ad&#xff0c;\u8f93\u51fa\u5e76\u5f52\u4e00\u5316\u591a\u6a21\u6001\u5411\u91cf&#xff08;\u8bed\u4e49\u5bf9\u9f50\u6838\u5fc3&#xff09;<\/span><br \/>\n\u2502   \u2514\u2500\u2500 retriever.py      <span class=\"token comment\"># [\u6df7\u5408\u8bb0\u5fc6] \u7ba1\u7406\u5411\u91cf\u6570\u636e\u5e93&#xff0c;\u6267\u884c\u8de8\u6a21\u6001&#xff08;\u5982\u6587\u641c\u56fe\u3001\u56fe\u641c\u89c6\u9891&#xff09;\u68c0\u7d22<\/span><br \/>\n\u2502<br \/>\n\u251c\u2500\u2500 data\/                 <span class=\"token comment\"># [\u5f02\u6784\u6570\u636e\u6e90] \u5b58\u653e\u5f85\u5904\u7406\u7684\u591a\u6a21\u6001\u539f\u59cb\u6587\u4ef6<\/span><br \/>\n\u2502   \u251c\u2500\u2500 text\/             <span class=\"token comment\"># [\u6587\u672c\u8bed\u6599] .txt \/ .json \u683c\u5f0f\u7684\u6587\u672c\u63cf\u8ff0<\/span><br \/>\n\u2502   \u251c\u2500\u2500 images\/           <span class=\"token comment\"># [\u9759\u6001\u89c6\u89c9] .jpg \/ .png \u683c\u5f0f\u7684\u56fe\u50cf\u6587\u4ef6<\/span><br \/>\n\u2502   \u2514\u2500\u2500 videos\/           <span class=\"token comment\"># [\u52a8\u6001\u89c6\u89c9] .mp4 \/ .avi 
\u683c\u5f0f\u7684\u89c6\u9891\u7247\u6bb5<\/span><br \/>\n\u2502<br \/>\n\u251c\u2500\u2500 embeddings_db\/        <span class=\"token comment\"># [\u8bed\u4e49\u7a7a\u95f4] \u6301\u4e45\u5316\u7684\u5411\u91cf\u5b58\u50a8<\/span><br \/>\n\u2502   \u2514\u2500\u2500 chroma\/           <span class=\"token comment\"># [ChromaDB] \u5b58\u50a8\u5df2\u5bf9\u9f50\u7684\u9ad8\u7ef4\u5411\u91cf\u53ca\u5176\u5143\u6570\u636e&#xff08;Metadata&#xff09;<\/span><br \/>\n\u2502<br \/>\n\u2514\u2500\u2500 logs\/                 <span class=\"token comment\"># [\u8fd0\u884c\u8bb0\u5f55] <\/span><br \/>\n    \u2514\u2500\u2500 multimodal.log    <span class=\"token comment\"># \u8bb0\u5f55\u7f16\u7801\u5ef6\u8fdf\u3001\u6a21\u6001\u8f6c\u6362\u9519\u8bef\u53ca\u68c0\u7d22\u5339\u914d\u5ea6\u65e5\u5fd7<\/span><\/p>\n<h5>4.1.2 \u6838\u5fc3\u6a21\u5757\u6df1\u5ea6\u89e3\u6790 (The Components)<\/h5>\n<p>\u8fd9\u4e9b\u6587\u4ef6\u5171\u540c\u6784\u6210\u4e86\u4e00\u4e2a\u80fd\u591f\u7406\u89e3\u591a\u79cd\u6a21\u6001\u7684\u201c\u7edf\u4e00\u5927\u8111\u201d\u3002\u4e0e\u4f20\u7edf\u591a\u6a21\u6001\u9879\u76ee\u4e0d\u540c&#xff0c;\u8fd9\u91cc\u7684\u6838\u5fc3\u903b\u8f91\u5728\u4e8e\u5982\u4f55\u62b9\u5e73\u4e0d\u540c\u6a21\u6001\u4e4b\u95f4\u7684\u5dee\u5f02\u3002<\/p>\n<p>1. 
\u57fa\u7840\u8bbe\u65bd\u4e0e\u6807\u51c6\u5b9a\u4e49 (The Infrastructure)<\/p>\n<ul>\n<li>config.py (\u7edf\u4e00\u6807\u51c6)\n<ul>\n<li>\u7528\u9014&#xff1a;\u8fd9\u662f\u7cfb\u7edf\u7684\u201c\u5ea6\u91cf\u8861\u201d\u3002<\/li>\n<li>\u6df1\u5ea6\u89e3\u6790&#xff1a;\u5b83\u5b9a\u4e49\u4e86 Gemini \u7edf\u4e00\u8868\u793a\u7a7a\u95f4 \u7684\u7269\u7406\u6cd5\u5219\u3002\u6700\u5173\u952e\u7684\u53c2\u6570\u662f D_MODEL &#061; 2048&#xff08;\u6216\u6839\u636e\u6a21\u578b\u53d8\u52a8&#xff09;&#xff0c;\u8fd9\u5f3a\u5236\u8981\u6c42\u65e0\u8bba\u662f\u6587\u672c\u3001\u56fe\u50cf\u8fd8\u662f\u89c6\u9891&#xff0c;\u6700\u7ec8\u8f93\u51fa\u7684\u5411\u91cf\u5fc5\u987b\u662f\u8fd9\u4e2a\u7ef4\u5ea6\u3002\u540c\u65f6&#xff0c;\u5b83\u5b9a\u4e49\u4e86 MAX_SEQ_LEN&#xff0c;\u786e\u4fdd\u4e0d\u540c\u6a21\u6001\u5728\u5e8f\u5217\u5316\u540e\u80fd\u9002\u914d\u540c\u4e00\u4e2a Transformer \u7684\u4e0a\u4e0b\u6587\u7a97\u53e3\u3002<\/li>\n<\/ul>\n<\/li>\n<li>core\/model_setup.py (\u5171\u4eab\u9aa8\u67b6)\n<ul>\n<li>\u7528\u9014&#xff1a;\u52a0\u8f7d Gemini \u7684\u201c\u5355\u4e00\u6a21\u578b\u6743\u91cd\u201d\u3002<\/li>\n<li>\u6df1\u5ea6\u89e3\u6790&#xff1a;\u4f20\u7edf\u67b6\u6784\u53ef\u80fd\u9700\u8981\u5206\u522b\u52a0\u8f7d BERT&#xff08;\u5904\u7406\u6587\u672c&#xff09;\u548c ResNet&#xff08;\u5904\u7406\u56fe\u50cf&#xff09;\u3002\u800c\u6b64\u6587\u4ef6\u901a\u8fc7 AutoModel.from_pretrained \u52a0\u8f7d \u540c\u4e00\u4e2a Transformer \u9aa8\u5e72\u7f51\u7edc\u3002\u5b83\u786e\u4fdd\u6587\u672c\u548c\u89c6\u89c9\u4fe1\u53f7\u6d41\u5411\u540c\u4e00\u4e2a\u795e\u7ecf\u7f51\u7edc&#xff0c;\u8fd9\u662f\u5b9e\u73b0\u201c\u539f\u751f\u591a\u6a21\u6001\u201d\u7684\u7269\u7406\u57fa\u7840\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>2. 
\u7edf\u4e00\u8868\u793a\u5f15\u64ce (The Unification Engine)<\/p>\n<p>\u8fd9\u662f\u9879\u76ee\u4e2d\u6700\u6838\u5fc3\u7684\u90e8\u5206&#xff0c;\u8d1f\u8d23\u6267\u884c\u201c\u8f93\u5165\u5bf9\u9f50\u201d\u548c\u201c\u8f93\u51fa\u5bf9\u9f50\u201d\u3002<\/p>\n<ul>\n<li>core\/modal_processor.py (\u5e8f\u5217\u5316\u5de5\u5382 \/ Input Layer)\n<ul>\n<li>\u7528\u9014&#xff1a;\u6a21\u6001\u7684\u201c\u7c89\u788e\u673a\u201d\u4e0e\u201c\u5305\u88c5\u673a\u201d\u3002<\/li>\n<li>\u534f\u4f5c\u903b\u8f91&#xff1a;\n<ul>\n<li>\u6587\u672c&#xff1a;\u8c03\u7528 Tokenizer \u8fdb\u884c BPE \u7f16\u7801&#xff0c;\u751f\u6210 1D ID \u5e8f\u5217\u3002<\/li>\n<li>\u56fe\u50cf&#xff1a;\u8c03\u7528 ImageProcessor \u5c06\u56fe\u7247\u5207\u5206\u4e3a 16&#215;16 \u7684 Patch&#xff0c;\u62c9\u5e73\u6210\u5e8f\u5217\u3002<\/li>\n<li>\u89c6\u9891&#xff1a;\u8fd9\u662f\u5173\u952e\u3002\u5b83\u6267\u884c\u201c\u65f6\u7a7a\u91c7\u6837\u201d&#xff0c;\u5148\u6309\u65f6\u95f4\u62bd\u5e27&#xff0c;\u518d\u6309\u7a7a\u95f4\u5207\u5757&#xff0c;\u6700\u7ec8\u751f\u6210\u5e26\u6709\u65f6\u95f4\u6233\u4fe1\u606f\u7684 3D Token \u5e8f\u5217\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u6838\u5fc3\u4ef7\u503c&#xff1a;\u5b83\u5411\u6a21\u578b\u5c4f\u853d\u4e86\u539f\u59cb\u6570\u636e\u7684\u683c\u5f0f\u5dee\u5f02&#xff0c;\u8ba9\u6a21\u578b\u53ea\u770b\u5230\u201cToken \u5e8f\u5217\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<li>core\/embedding.py (\u7edf\u4e00\u6620\u5c04 \/ Output Layer)\n<ul>\n<li>\u7528\u9014&#xff1a;\u5411\u91cf\u751f\u6210\u5668\u4e0e\u5f52\u4e00\u5316\u5668\u3002<\/li>\n<li>\u6df1\u5ea6\u89e3\u6790&#xff1a;\n<ul>\n<li>\u6295\u5f71 (Projection)&#xff1a;\u5982\u679c\u539f\u59cb\u6a21\u578b\u7684\u89c6\u89c9\u5934\u8f93\u51fa\u7ef4\u5ea6&#xff08;\u5982 1024&#xff09;\u4e0e\u6587\u672c\u5934&#xff08;\u5982 4096&#xff09;\u4e0d\u4e00\u81f4&#xff0c;\u6b64\u6a21\u5757\u4f1a\u901a\u8fc7\u7ebf\u6027\u5c42 (nn.Linear) \u5c06\u5b83\u4eec\u5f3a\u5236\u6295\u5f71\u5230 config.py \u5b9a\u4e49\u7684 D_MODEL 
\u7ef4\u5ea6\u3002<\/li>\n<li>\u5f52\u4e00\u5316 (Normalization)&#xff1a;\u6267\u884c L2 \u5f52\u4e00\u5316\u3002\u8fd9\u662f\u8de8\u6a21\u6001\u68c0\u7d22\u7cbe\u5ea6\u7684\u4fdd\u8bc1&#xff0c;\u5b83\u786e\u4fdd\u201c\u4e00\u53ea\u732b\u201d\u7684\u56fe\u7247\u5411\u91cf\u548c\u201ccat\u201d\u7684\u6587\u672c\u5411\u91cf\u843d\u5728\u540c\u4e00\u4e2a\u8d85\u7403\u9762\u4e0a&#xff0c;\u4f7f\u5176\u4f59\u5f26\u76f8\u4f3c\u5ea6\u53ef\u8ba1\u7b97\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>3. \u8bb0\u5fc6\u4e0e\u68c0\u7d22 (The Memory &amp; Retrieval)<\/p>\n<ul>\n<li>core\/retriever.py (\u6df7\u5408\u8bb0\u5fc6)\n<ul>\n<li>\u7528\u9014&#xff1a;\u8de8\u6a21\u6001\u68c0\u7d22\u5f15\u64ce\u3002<\/li>\n<li>\u6df1\u5ea6\u89e3\u6790&#xff1a;\u5b83\u4e0d\u533a\u5206\u6570\u636e\u6765\u6e90\u3002\u5728 ChromaDB \u4e2d&#xff0c;\u5b83\u5c06\u6587\u672c\u5411\u91cf\u3001\u56fe\u50cf\u5411\u91cf\u548c\u89c6\u9891\u5411\u91cf\u5b58\u50a8\u5728\u540c\u4e00\u4e2a\u96c6\u5408 (Collection) \u4e2d\u3002<\/li>\n<li>\u5de5\u4f5c\u6d41&#xff1a;\u5f53\u4f60\u8f93\u5165\u4e00\u6bb5\u6587\u5b57\u201c\u6d77\u8fb9\u65e5\u843d\u201d\u65f6&#xff0c;\u6b64\u6a21\u5757\u5c06\u5176\u8f6c\u5316\u4e3a\u5411\u91cf&#xff0c;\u5e76\u5728\u540c\u4e00\u4e2a\u6570\u5b66\u7a7a\u95f4\u4e2d\u540c\u65f6\u5bfb\u627e\u8ddd\u79bb\u6700\u8fd1\u7684\u56fe\u7247\u548c\u89c6\u9891\u3002\u8fd9\u8bc1\u660e\u4e86\u4e0d\u540c\u6a21\u6001\u7684\u6570\u636e\u5df2\u7ecf\u5728\u6570\u5b66\u4e0a\u5b9e\u73b0\u4e86\u201c\u7edf\u4e00\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<li>embeddings_db\/ (\u8bed\u4e49\u7a7a\u95f4)\n<ul>\n<li>\u7528\u9014&#xff1a;\u6301\u4e45\u5316\u5b58\u50a8\u3002<\/li>\n<li>\u6838\u5fc3\u7279\u70b9&#xff1a;\u8fd9\u91cc\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\u4e0d\u4ec5\u5b58\u50a8\u4e86\u5411\u91cf&#xff0c;\u8fd8\u5b58\u50a8\u4e86 modal_type (\u6a21\u6001\u7c7b\u578b) 
\u7b49\u5143\u6570\u636e&#xff0c;\u5141\u8bb8\u6211\u4eec\u5728\u7edf\u4e00\u641c\u7d22\u7684\u57fa\u7840\u4e0a\u8fdb\u884c\u7279\u5b9a\u6a21\u6001\u7684\u8fc7\u6ee4&#xff08;\u4f8b\u5982&#xff1a;\u53ea\u641c\u89c6\u9891&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h4>4.2 \u6838\u5fc3\u914d\u7f6e\u6587\u4ef6&#xff08;config.py&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> os<br \/>\n<span class=\"token keyword\">from<\/span> pathlib <span class=\"token keyword\">import<\/span> Path<br \/>\n<span class=\"token keyword\">import<\/span> torch<\/p>\n<p><span class=\"token comment\"># \u9879\u76ee\u8def\u5f84\u914d\u7f6e<\/span><br \/>\nPROJECT_ROOT <span class=\"token operator\">&#061;<\/span> Path<span class=\"token punctuation\">(<\/span>__file__<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>parent<br \/>\nDATA_DIR <span class=\"token operator\">&#061;<\/span> PROJECT_ROOT <span class=\"token operator\">\/<\/span> <span class=\"token string\">&#034;data&#034;<\/span><br \/>\nEMBEDDINGS_DB_DIR <span class=\"token operator\">&#061;<\/span> PROJECT_ROOT <span class=\"token operator\">\/<\/span> <span class=\"token string\">&#034;embeddings_db&#034;<\/span> <span class=\"token operator\">\/<\/span> <span class=\"token string\">&#034;chroma&#034;<\/span><br \/>\nLOGS_DIR <span class=\"token operator\">&#061;<\/span> PROJECT_ROOT <span class=\"token operator\">\/<\/span> <span class=\"token string\">&#034;logs&#034;<\/span><\/p>\n<p><span class=\"token comment\"># \u521b\u5efa\u5fc5\u8981\u76ee\u5f55<\/span><br \/>\n<span class=\"token keyword\">for<\/span> dir_path <span class=\"token keyword\">in<\/span> <span class=\"token punctuation\">[<\/span>DATA_DIR<span class=\"token punctuation\">,<\/span> EMBEDDINGS_DB_DIR<span class=\"token punctuation\">,<\/span> LOGS_DIR<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    dir_path<span class=\"token punctuation\">.<\/span>mkdir<span class=\"token 
punctuation\">(<\/span>parents<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span> exist_ok<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># \u6a21\u578b\u914d\u7f6e<\/span><br \/>\nMODEL_DIR <span class=\"token operator\">&#061;<\/span> Path<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\/data\/models\/gemini&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\nMODEL_NAME <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;google\/gemma-2-9b-it&#034;<\/span><br \/>\nDEVICE <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;cuda&#034;<\/span> <span class=\"token keyword\">if<\/span> torch<span class=\"token punctuation\">.<\/span>cuda<span class=\"token punctuation\">.<\/span>is_available<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token keyword\">else<\/span> <span class=\"token string\">&#034;cpu&#034;<\/span><br \/>\nD_MODEL <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">2048<\/span>  <span class=\"token comment\"># \u7edf\u4e00\u5d4c\u5165\u7ef4\u5ea6<\/span><br \/>\nMAX_SEQ_LEN <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">4096<\/span>  <span class=\"token comment\"># \u6700\u5927\u5e8f\u5217\u957f\u5ea6<\/span><\/p>\n<p><span class=\"token comment\"># \u591a\u6a21\u6001\u5904\u7406\u914d\u7f6e<\/span><br \/>\nVIDEO_FRAME_RATE <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">1<\/span>  <span class=\"token comment\"># \u89c6\u9891\u62bd\u5e27\u5e27\u7387<\/span><br \/>\nIMAGE_SIZE <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">(<\/span><span class=\"token number\">256<\/span><span class=\"token 
punctuation\">,<\/span> <span class=\"token number\">256<\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u56fe\u50cf\u5c3a\u5bf8<\/span><br \/>\nPATCH_SIZE <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">16<\/span>  <span class=\"token comment\"># \u89c6\u89c9Patch\u5c3a\u5bf8<\/span><br \/>\nNORMALIZE_EMBEDDINGS <span class=\"token operator\">&#061;<\/span> <span class=\"token boolean\">True<\/span>  <span class=\"token comment\"># \u662f\u5426\u5f52\u4e00\u5316\u5d4c\u5165\u5411\u91cf<\/span><\/p>\n<p><span class=\"token comment\"># \u5411\u91cf\u6570\u636e\u5e93\u914d\u7f6e<\/span><br \/>\nCHROMA_PERSIST_DIRECTORY <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">str<\/span><span class=\"token punctuation\">(<\/span>EMBEDDINGS_DB_DIR<span class=\"token punctuation\">)<\/span><br \/>\nCHROMA_COLLECTION_NAME <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;gemini_multimodal&#034;<\/span><br \/>\nRETRIEVE_TOP_K <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">5<\/span>  <span class=\"token comment\"># \u8de8\u6a21\u6001\u68c0\u7d22\u8fd4\u56de\u6570\u91cf<\/span><\/p>\n<p><span class=\"token comment\"># Gemini API\u914d\u7f6e&#xff08;\u5907\u7528&#xff0c;\u672c\u5730\u90e8\u7f72\u4f18\u5148&#xff09;<\/span><br \/>\nGEMINI_API_KEY <span class=\"token operator\">&#061;<\/span> os<span class=\"token punctuation\">.<\/span>getenv<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;GEMINI_API_KEY&#034;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\nGEMINI_API_ENDPOINT <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;https:\/\/generativelanguage.googleapis.com\/v1\/models\/gemini-1.5-flash:generateContent&#034;<\/span><\/p>\n<h4>4.3 
\u6a21\u578b\u521d\u59cb\u5316&#xff08;core\/model_setup.py&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> torch<br \/>\n<span class=\"token keyword\">from<\/span> transformers <span class=\"token keyword\">import<\/span> AutoModel<span class=\"token punctuation\">,<\/span> AutoTokenizer<span class=\"token punctuation\">,<\/span> AutoImageProcessor<br \/>\n<span class=\"token keyword\">from<\/span> config <span class=\"token keyword\">import<\/span> MODEL_DIR<span class=\"token punctuation\">,<\/span> MODEL_NAME<span class=\"token punctuation\">,<\/span> DEVICE<span class=\"token punctuation\">,<\/span> D_MODEL<span class=\"token punctuation\">,<\/span> MAX_SEQ_LEN<span class=\"token punctuation\">,<\/span> IMAGE_SIZE<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">setup_gemini_model<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u521d\u59cb\u5316Gemini\u591a\u6a21\u6001\u6a21\u578b&#034;&#034;&#034;<\/span><br \/>\n    <span class=\"token comment\"># 1. 
\u52a0\u8f7d\u6587\u672cTokenizer<\/span><br \/>\n    tokenizer <span class=\"token operator\">&#061;<\/span> AutoTokenizer<span class=\"token punctuation\">.<\/span>from_pretrained<span class=\"token punctuation\">(<\/span><br \/>\n        MODEL_NAME<span class=\"token punctuation\">,<\/span><br \/>\n        cache_dir<span class=\"token operator\">&#061;<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">(<\/span>MODEL_DIR<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        padding_side<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;right&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        truncation_side<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;right&#034;<\/span><br \/>\n    <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 2. \u52a0\u8f7d\u89c6\u89c9\u5904\u7406\u5668&#xff08;\u56fe\u50cf\/\u89c6\u9891&#xff09;<\/span><br \/>\n    image_processor <span class=\"token operator\">&#061;<\/span> AutoImageProcessor<span class=\"token punctuation\">.<\/span>from_pretrained<span class=\"token punctuation\">(<\/span><br \/>\n        MODEL_NAME<span class=\"token punctuation\">,<\/span><br \/>\n        cache_dir<span class=\"token operator\">&#061;<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">(<\/span>MODEL_DIR<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        image_size<span class=\"token operator\">&#061;<\/span>IMAGE_SIZE  <span class=\"token comment\"># \u5339\u914dconfig\u4e2d\u7684\u56fe\u50cf\u5c3a\u5bf8<\/span><br \/>\n    <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 3. 
\u52a0\u8f7d\u591a\u6a21\u6001\u6a21\u578b<\/span><br \/>\n    model <span class=\"token operator\">&#061;<\/span> AutoModel<span class=\"token punctuation\">.<\/span>from_pretrained<span class=\"token punctuation\">(<\/span><br \/>\n        MODEL_NAME<span class=\"token punctuation\">,<\/span><br \/>\n        cache_dir<span class=\"token operator\">&#061;<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">(<\/span>MODEL_DIR<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        torch_dtype<span class=\"token operator\">&#061;<\/span>torch<span class=\"token punctuation\">.<\/span>float16 <span class=\"token keyword\">if<\/span> DEVICE <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;cuda&#034;<\/span> <span class=\"token keyword\">else<\/span> torch<span class=\"token punctuation\">.<\/span>float32<span class=\"token punctuation\">,<\/span><br \/>\n        device_map<span class=\"token operator\">&#061;<\/span>DEVICE<span class=\"token punctuation\">,<\/span><br \/>\n        trust_remote_code<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span>  <span class=\"token comment\"># \u52a0\u8f7d\u81ea\u5b9a\u4e49\u6a21\u578b\u4ee3\u7801<\/span><br \/>\n    <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 4. 
\u9002\u914d\u7edf\u4e00\u5d4c\u5165\u7ef4\u5ea6&#xff08;\u82e5\u6a21\u578b\u8f93\u51fa\u7ef4\u5ea6\u4e0d\u7b26&#xff09;<\/span><br \/>\n    <span class=\"token keyword\">if<\/span> model<span class=\"token punctuation\">.<\/span>config<span class=\"token punctuation\">.<\/span>hidden_size <span class=\"token operator\">!&#061;<\/span> D_MODEL<span class=\"token punctuation\">:<\/span><br \/>\n        model<span class=\"token punctuation\">.<\/span>embed_proj <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>model<span class=\"token punctuation\">.<\/span>config<span class=\"token punctuation\">.<\/span>hidden_size<span class=\"token punctuation\">,<\/span> D_MODEL<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>to<span class=\"token punctuation\">(<\/span>DEVICE<span class=\"token punctuation\">)<\/span><\/p>\n<p>    model<span class=\"token punctuation\">.<\/span><span class=\"token builtin\">eval<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u63a8\u7406\u6a21\u5f0f<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;Gemini\u6a21\u578b\u521d\u59cb\u5316\u5b8c\u6210&#xff0c;\u8bbe\u5907&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>DEVICE<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> model<span class=\"token punctuation\">,<\/span> tokenizer<span class=\"token punctuation\">,<\/span> image_processor<\/p>\n<p><span class=\"token comment\"># 
\u5355\u4f8b\u6a21\u5f0f\u52a0\u8f7d\u6a21\u578b&#xff08;\u907f\u514d\u91cd\u590d\u521d\u59cb\u5316&#xff09;<\/span><br \/>\n_model<span class=\"token punctuation\">,<\/span> _tokenizer<span class=\"token punctuation\">,<\/span> _image_processor <span class=\"token operator\">&#061;<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><br \/>\n<span class=\"token keyword\">def<\/span> <span class=\"token function\">get_gemini_model<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">global<\/span> _model<span class=\"token punctuation\">,<\/span> _tokenizer<span class=\"token punctuation\">,<\/span> _image_processor<br \/>\n    <span class=\"token keyword\">if<\/span> _model <span class=\"token keyword\">is<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        _model<span class=\"token punctuation\">,<\/span> _tokenizer<span class=\"token punctuation\">,<\/span> _image_processor <span class=\"token operator\">&#061;<\/span> setup_gemini_model<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> _model<span class=\"token punctuation\">,<\/span> _tokenizer<span class=\"token punctuation\">,<\/span> _image_processor<\/p>\n<h4>4.4 \u591a\u6a21\u6001\u9884\u5904\u7406&#xff08;core\/modal_processor.py&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> cv2<br \/>\n<span class=\"token keyword\">import<\/span> numpy <span class=\"token keyword\">as<\/span> np<br \/>\n<span class=\"token keyword\">import<\/span> torch<br \/>\n<span class=\"token keyword\">from<\/span> PIL <span class=\"token 
keyword\">import<\/span> Image<br \/>\n<span class=\"token keyword\">from<\/span> config <span class=\"token keyword\">import<\/span> VIDEO_FRAME_RATE<span class=\"token punctuation\">,<\/span> IMAGE_SIZE<span class=\"token punctuation\">,<\/span> PATCH_SIZE<span class=\"token punctuation\">,<\/span> MAX_SEQ_LEN<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">ModalProcessor<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> tokenizer<span class=\"token punctuation\">,<\/span> image_processor<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>tokenizer <span class=\"token operator\">&#061;<\/span> tokenizer<br \/>\n        self<span class=\"token punctuation\">.<\/span>image_processor <span class=\"token operator\">&#061;<\/span> image_processor<\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">process_text<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u5904\u7406\u6587\u672c&#xff1a;Tokenize\u5e76\u622a\u65ad&#034;&#034;&#034;<\/span><br \/>\n        inputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>tokenizer<span class=\"token punctuation\">(<\/span><br \/>\n            text<span class=\"token punctuation\">,<\/span><br \/>\n            max_length<span class=\"token operator\">&#061;<\/span>MAX_SEQ_LEN<span class=\"token punctuation\">,<\/span><br \/>\n            padding<span class=\"token operator\">&#061;<\/span><span class=\"token 
string\">&#034;max_length&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            truncation<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            return_tensors<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;pt&#034;<\/span><br \/>\n        <span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> inputs<\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">process_image<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> image_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u5904\u7406\u56fe\u50cf&#xff1a;\u52a0\u8f7d\u2192Resize\u2192Patch\u5316&#034;&#034;&#034;<\/span><br \/>\n        <span class=\"token comment\"># \u52a0\u8f7d\u56fe\u50cf<\/span><br \/>\n        image <span class=\"token operator\">&#061;<\/span> Image<span class=\"token punctuation\">.<\/span><span class=\"token builtin\">open<\/span><span class=\"token punctuation\">(<\/span>image_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>convert<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;RGB&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u56fe\u50cf\u9884\u5904\u7406<\/span><br \/>\n        inputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>image_processor<span class=\"token punctuation\">(<\/span><br \/>\n            images<span class=\"token operator\">&#061;<\/span>image<span class=\"token punctuation\">,<\/span><br \/>\n            return_tensors<span class=\"token operator\">&#061;<\/span><span class=\"token 
string\">&#034;pt&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            do_resize<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            size<span class=\"token operator\">&#061;<\/span>IMAGE_SIZE<br \/>\n        <span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> inputs<\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">process_video<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> video_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u5904\u7406\u89c6\u9891&#xff1a;\u62bd\u5e27\u2192\u9010\u5e27\u5904\u7406\u2192\u62fc\u63a5&#034;&#034;&#034;<\/span><br \/>\n        <span class=\"token comment\"># 1. \u62bd\u5e27<\/span><br \/>\n        cap <span class=\"token operator\">&#061;<\/span> cv2<span class=\"token punctuation\">.<\/span>VideoCapture<span class=\"token punctuation\">(<\/span>video_path<span class=\"token punctuation\">)<\/span><br \/>\n        frames <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        frame_idx <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0<\/span><br \/>\n        fps <span class=\"token operator\">&#061;<\/span> cap<span class=\"token punctuation\">.<\/span>get<span class=\"token punctuation\">(<\/span>cv2<span class=\"token punctuation\">.<\/span>CAP_PROP_FPS<span class=\"token punctuation\">)<\/span><br \/>\n        sample_interval <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">int<\/span><span class=\"token punctuation\">(<\/span>fps <span class=\"token operator\">\/<\/span> VIDEO_FRAME_RATE<span 
class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token keyword\">while<\/span> cap<span class=\"token punctuation\">.<\/span>isOpened<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            ret<span class=\"token punctuation\">,<\/span> frame <span class=\"token operator\">&#061;<\/span> cap<span class=\"token punctuation\">.<\/span>read<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            <span class=\"token keyword\">if<\/span> <span class=\"token keyword\">not<\/span> ret<span class=\"token punctuation\">:<\/span><br \/>\n                <span class=\"token keyword\">break<\/span><br \/>\n            <span class=\"token keyword\">if<\/span> frame_idx <span class=\"token operator\">%<\/span> sample_interval <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">0<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                <span class=\"token comment\"># \u8f6c\u6362\u4e3aRGB\u5e76Resize<\/span><br \/>\n                frame <span class=\"token operator\">&#061;<\/span> cv2<span class=\"token punctuation\">.<\/span>cvtColor<span class=\"token punctuation\">(<\/span>frame<span class=\"token punctuation\">,<\/span> cv2<span class=\"token punctuation\">.<\/span>COLOR_BGR2RGB<span class=\"token punctuation\">)<\/span><br \/>\n                frame <span class=\"token operator\">&#061;<\/span> cv2<span class=\"token punctuation\">.<\/span>resize<span class=\"token punctuation\">(<\/span>frame<span class=\"token punctuation\">,<\/span> IMAGE_SIZE<span class=\"token punctuation\">)<\/span><br \/>\n                frames<span class=\"token punctuation\">.<\/span>append<span class=\"token punctuation\">(<\/span>Image<span class=\"token punctuation\">.<\/span>fromarray<span class=\"token punctuation\">(<\/span>frame<span class=\"token punctuation\">)<\/span><span 
class=\"token punctuation\">)<\/span><br \/>\n            frame_idx <span class=\"token operator\">&#043;&#061;<\/span> <span class=\"token number\">1<\/span><br \/>\n        cap<span class=\"token punctuation\">.<\/span>release<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 2. \u9884\u5904\u7406\u5e27&#xff08;\u9650\u5236\u6700\u5927\u5e27\u6570&#xff09;<\/span><br \/>\n        max_frames <span class=\"token operator\">&#061;<\/span> MAX_SEQ_LEN <span class=\"token operator\">\/\/<\/span> <span class=\"token punctuation\">(<\/span>IMAGE_SIZE<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">\/\/<\/span> PATCH_SIZE<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">**<\/span><span class=\"token number\">2<\/span><br \/>\n        frames <span class=\"token operator\">&#061;<\/span> frames<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span>max_frames<span class=\"token punctuation\">]<\/span><\/p>\n<p>        <span class=\"token comment\"># 3. 
\u89c6\u9891\u5e27\u9884\u5904\u7406<\/span><br \/>\n        inputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>image_processor<span class=\"token punctuation\">(<\/span><br \/>\n            images<span class=\"token operator\">&#061;<\/span>frames<span class=\"token punctuation\">,<\/span><br \/>\n            return_tensors<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;pt&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            do_resize<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            size<span class=\"token operator\">&#061;<\/span>IMAGE_SIZE<br \/>\n        <span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u6dfb\u52a0\u65f6\u95f4\u7ef4\u5ea6\u6807\u8bb0<\/span><br \/>\n        inputs<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;time_ids&#034;<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>arange<span class=\"token punctuation\">(<\/span><span class=\"token builtin\">len<\/span><span class=\"token punctuation\">(<\/span>frames<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>unsqueeze<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> inputs<\/p>\n<p><span class=\"token comment\"># \u6d4b\u8bd5\u5904\u7406\u5668<\/span><br \/>\n<span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">from<\/span> 
core<span class=\"token punctuation\">.<\/span>model_setup <span class=\"token keyword\">import<\/span> get_gemini_model<br \/>\n    _<span class=\"token punctuation\">,<\/span> tokenizer<span class=\"token punctuation\">,<\/span> image_processor <span class=\"token operator\">&#061;<\/span> get_gemini_model<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    processor <span class=\"token operator\">&#061;<\/span> ModalProcessor<span class=\"token punctuation\">(<\/span>tokenizer<span class=\"token punctuation\">,<\/span> image_processor<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u6d4b\u8bd5\u6587\u672c\u5904\u7406<\/span><br \/>\n    text_inputs <span class=\"token operator\">&#061;<\/span> processor<span class=\"token punctuation\">.<\/span>process_text<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u7ea2\u8272\u8dd1\u8f66\u5728\u516c\u8def\u4e0a\u884c\u9a76&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672cToken\u5f62\u72b6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text_inputs<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;input_ids&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u6d4b\u8bd5\u56fe\u50cf\u5904\u7406<\/span><br \/>\n    image_inputs <span class=\"token operator\">&#061;<\/span> processor<span class=\"token punctuation\">.<\/span>process_image<span class=\"token punctuation\">(<\/span><span class=\"token 
string\">&#034;data\/images\/car.jpg&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u56fe\u50cfPixel\u503c\u5f62\u72b6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>image_inputs<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;pixel_values&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u6d4b\u8bd5\u89c6\u9891\u5904\u7406<\/span><br \/>\n    video_inputs <span class=\"token operator\">&#061;<\/span> processor<span class=\"token punctuation\">.<\/span>process_video<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;data\/videos\/car_driving.mp4&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u89c6\u9891Pixel\u503c\u5f62\u72b6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>video_inputs<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;pixel_values&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token 
string\">f&#034;\u89c6\u9891\u65f6\u95f4ID\u5f62\u72b6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>video_inputs<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;time_ids&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>4.5 \u751f\u6210\u7edf\u4e00\u5d4c\u5165&#xff08;core\/embedding.py&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> torch<br \/>\n<span class=\"token keyword\">from<\/span> config <span class=\"token keyword\">import<\/span> DEVICE<span class=\"token punctuation\">,<\/span> D_MODEL<span class=\"token punctuation\">,<\/span> NORMALIZE_EMBEDDINGS<br \/>\n<span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>modal_processor <span class=\"token keyword\">import<\/span> ModalProcessor<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">MultimodalEmbedding<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>model_setup <span class=\"token keyword\">import<\/span> get_gemini_model<br \/>\n        self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>tokenizer<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>image_processor <span class=\"token operator\">&#061;<\/span> get_gemini_model<span class=\"token 
punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>processor <span class=\"token operator\">&#061;<\/span> ModalProcessor<span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>tokenizer<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>image_processor<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token decorator annotation punctuation\">&#064;torch<span class=\"token punctuation\">.<\/span>no_grad<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">get_text_embedding<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u751f\u6210\u6587\u672c\u7684\u7edf\u4e00\u5d4c\u5165&#034;&#034;&#034;<\/span><br \/>\n        inputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>processor<span class=\"token punctuation\">.<\/span>process_text<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>to<span class=\"token punctuation\">(<\/span>DEVICE<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u524d\u5411\u4f20\u64ad\u83b7\u53d6\u9690\u85cf\u72b6\u6001<\/span><br \/>\n        outputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">(<\/span><span class=\"token operator\">**<\/span>inputs<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u53d6&lt;CLS&gt; 
token\u7684\u5d4c\u5165&#xff08;\u6216\u5747\u503c\u6c60\u5316&#xff09;<\/span><br \/>\n        embedding <span class=\"token operator\">&#061;<\/span> outputs<span class=\"token punctuation\">.<\/span>last_hidden_state<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        <span class=\"token comment\"># \u9002\u914d\u7edf\u4e00\u7ef4\u5ea6<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> <span class=\"token builtin\">hasattr<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;embed_proj&#034;<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            embedding <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">.<\/span>embed_proj<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u5f52\u4e00\u5316<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> NORMALIZE_EMBEDDINGS<span class=\"token punctuation\">:<\/span><br \/>\n            embedding <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>nn<span class=\"token punctuation\">.<\/span>functional<span class=\"token punctuation\">.<\/span>normalize<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">,<\/span> p<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token 
operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> embedding<span class=\"token punctuation\">.<\/span>cpu<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>numpy<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><\/p>\n<p>    <span class=\"token decorator annotation punctuation\">&#064;torch<span class=\"token punctuation\">.<\/span>no_grad<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">get_image_embedding<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> image_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u751f\u6210\u56fe\u50cf\u7684\u7edf\u4e00\u5d4c\u5165&#034;&#034;&#034;<\/span><br \/>\n        inputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>processor<span class=\"token punctuation\">.<\/span>process_image<span class=\"token punctuation\">(<\/span>image_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>to<span class=\"token punctuation\">(<\/span>DEVICE<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u89c6\u89c9\u524d\u5411\u4f20\u64ad<\/span><br \/>\n        outputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">.<\/span>visual_model<span class=\"token punctuation\">(<\/span><span 
class=\"token operator\">**<\/span>inputs<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u5747\u503c\u6c60\u5316\u83b7\u53d6\u56fe\u50cf\u5d4c\u5165<\/span><br \/>\n        embedding <span class=\"token operator\">&#061;<\/span> outputs<span class=\"token punctuation\">.<\/span>last_hidden_state<span class=\"token punctuation\">.<\/span>mean<span class=\"token punctuation\">(<\/span>dim<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u9002\u914d\u7edf\u4e00\u7ef4\u5ea6<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> <span class=\"token builtin\">hasattr<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;embed_proj&#034;<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            embedding <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">.<\/span>embed_proj<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u5f52\u4e00\u5316<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> NORMALIZE_EMBEDDINGS<span class=\"token punctuation\">:<\/span><br \/>\n            embedding <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>nn<span class=\"token punctuation\">.<\/span>functional<span class=\"token punctuation\">.<\/span>normalize<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">,<\/span> p<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token 
operator\">&#061;<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> embedding<span class=\"token punctuation\">.<\/span>cpu<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>numpy<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><\/p>\n<p>    <span class=\"token decorator annotation punctuation\">&#064;torch<span class=\"token punctuation\">.<\/span>no_grad<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">get_video_embedding<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> video_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u751f\u6210\u89c6\u9891\u7684\u7edf\u4e00\u5d4c\u5165&#034;&#034;&#034;<\/span><br \/>\n        inputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>processor<span class=\"token punctuation\">.<\/span>process_video<span class=\"token punctuation\">(<\/span>video_path<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>to<span class=\"token punctuation\">(<\/span>DEVICE<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u89c6\u9891\u524d\u5411\u4f20\u64ad&#xff08;\u65f6\u7a7a\u7f16\u7801&#xff09;<\/span><br \/>\n        visual_outputs <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>model<span class=\"token 
punctuation\">.<\/span>visual_model<span class=\"token punctuation\">(<\/span><span class=\"token operator\">**<\/span>inputs<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u65f6\u7a7a\u5747\u503c\u6c60\u5316&#xff08;\u65f6\u95f4&#043;\u7a7a\u95f4\u7ef4\u5ea6&#xff09;<\/span><br \/>\n        embedding <span class=\"token operator\">&#061;<\/span> visual_outputs<span class=\"token punctuation\">.<\/span>last_hidden_state<span class=\"token punctuation\">.<\/span>mean<span class=\"token punctuation\">(<\/span>dim<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u9002\u914d\u7edf\u4e00\u7ef4\u5ea6<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> <span class=\"token builtin\">hasattr<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;embed_proj&#034;<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            embedding <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>model<span class=\"token punctuation\">.<\/span>embed_proj<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u5f52\u4e00\u5316<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> NORMALIZE_EMBEDDINGS<span class=\"token punctuation\">:<\/span><br \/>\n            embedding <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>nn<span class=\"token punctuation\">.<\/span>functional<span 
class=\"token punctuation\">.<\/span>normalize<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">,<\/span> p<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> embedding<span class=\"token punctuation\">.<\/span>cpu<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>numpy<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><\/p>\n<p><span class=\"token comment\"># \u6d4b\u8bd5\u5d4c\u5165\u751f\u6210<\/span><br \/>\n<span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    embedder <span class=\"token operator\">&#061;<\/span> MultimodalEmbedding<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u751f\u6210\u6587\u672c\u5d4c\u5165<\/span><br \/>\n    text_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u7ea2\u8272\u8dd1\u8f66\u5728\u516c\u8def\u4e0a\u884c\u9a76&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token 
string\">f&#034;\u6587\u672c\u5d4c\u5165\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text_emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5e94\u4e3a (2048,)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u751f\u6210\u56fe\u50cf\u5d4c\u5165<\/span><br \/>\n    image_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_image_embedding<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;data\/images\/car.jpg&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u56fe\u50cf\u5d4c\u5165\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>image_emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5e94\u4e3a (2048,)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u751f\u6210\u89c6\u9891\u5d4c\u5165<\/span><br \/>\n    video_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_video_embedding<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;data\/videos\/car_driving.mp4&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u89c6\u9891\u5d4c\u5165\u7ef4\u5ea6&#xff1a;<\/span><span 
class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>video_emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5e94\u4e3a (2048,)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u8ba1\u7b97\u8de8\u6a21\u6001\u76f8\u4f3c\u5ea6<\/span><br \/>\n    text_image_sim <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>dot<span class=\"token punctuation\">(<\/span>text_emb<span class=\"token punctuation\">,<\/span> image_emb<span class=\"token punctuation\">)<\/span><br \/>\n    text_video_sim <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>dot<span class=\"token punctuation\">(<\/span>text_emb<span class=\"token punctuation\">,<\/span> video_emb<span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672c-\u56fe\u50cf\u76f8\u4f3c\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text_image_sim<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672c-\u89c6\u9891\u76f8\u4f3c\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text_video_sim<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token 
punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>4.6 \u8de8\u6a21\u6001\u68c0\u7d22&#xff08;core\/retriever.py&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> chromadb<br \/>\n<span class=\"token keyword\">import<\/span> numpy <span class=\"token keyword\">as<\/span> np<br \/>\n<span class=\"token keyword\">from<\/span> chromadb<span class=\"token punctuation\">.<\/span>config <span class=\"token keyword\">import<\/span> Settings<br \/>\n<span class=\"token keyword\">from<\/span> config <span class=\"token keyword\">import<\/span> CHROMA_PERSIST_DIRECTORY<span class=\"token punctuation\">,<\/span> CHROMA_COLLECTION_NAME<span class=\"token punctuation\">,<\/span> RETRIEVE_TOP_K<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">MultimodalRetriever<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># \u521d\u59cb\u5316Chroma\u5411\u91cf\u6570\u636e\u5e93<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>client <span class=\"token operator\">&#061;<\/span> chromadb<span class=\"token punctuation\">.<\/span>Client<span class=\"token punctuation\">(<\/span>Settings<span class=\"token punctuation\">(<\/span><br \/>\n            persist_directory<span class=\"token operator\">&#061;<\/span>CHROMA_PERSIST_DIRECTORY<span class=\"token punctuation\">,<\/span><br \/>\n            anonymized_telemetry<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">False<\/span><br \/>\n        <span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token 
comment\"># \u83b7\u53d6\u6216\u521b\u5efa\u591a\u6a21\u6001\u96c6\u5408<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>collection <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>client<span class=\"token punctuation\">.<\/span>get_or_create_collection<span class=\"token punctuation\">(<\/span><br \/>\n            name<span class=\"token operator\">&#061;<\/span>CHROMA_COLLECTION_NAME<span class=\"token punctuation\">,<\/span><br \/>\n            metadata<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;description&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;Gemini\u591a\u6a21\u6001\u7edf\u4e00\u5d4c\u5165\u96c6\u5408&#034;<\/span><span class=\"token punctuation\">}<\/span><br \/>\n        <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">add_embedding<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> <span class=\"token builtin\">id<\/span><span class=\"token punctuation\">,<\/span> embedding<span class=\"token punctuation\">,<\/span> modal_type<span class=\"token punctuation\">,<\/span> metadata<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u6dfb\u52a0\u5d4c\u5165\u5230\u5411\u91cf\u5e93&#034;&#034;&#034;<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>collection<span class=\"token punctuation\">.<\/span>add<span class=\"token punctuation\">(<\/span><br \/>\n            ids<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token builtin\">id<\/span><span class=\"token 
punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            embeddings<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span>embedding<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            metadatas<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;modal_type&#034;<\/span><span class=\"token punctuation\">:<\/span> modal_type<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">**<\/span><span class=\"token punctuation\">(<\/span>metadata <span class=\"token keyword\">or<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token punctuation\">}<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">}<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        <span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>client<span class=\"token punctuation\">.<\/span>persist<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">retrieve<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> query_embedding<span class=\"token punctuation\">,<\/span> filter_modal_type<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u8de8\u6a21\u6001\u68c0\u7d22\u76f8\u4f3c\u5185\u5bb9&#034;&#034;&#034;<\/span><br \/>\n        <span class=\"token comment\"># \u6784\u5efa\u8fc7\u6ee4\u6761\u4ef6<\/span><br \/>\n        where_clause <span class=\"token operator\">&#061;<\/span> <span 
class=\"token boolean\">None<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> filter_modal_type<span class=\"token punctuation\">:<\/span><br \/>\n            where_clause <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;modal_type&#034;<\/span><span class=\"token punctuation\">:<\/span> filter_modal_type<span class=\"token punctuation\">}<\/span><\/p>\n<p>        <span class=\"token comment\"># \u68c0\u7d22<\/span><br \/>\n        results <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>collection<span class=\"token punctuation\">.<\/span>query<span class=\"token punctuation\">(<\/span><br \/>\n            query_embeddings<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span>query_embedding<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            n_results<span class=\"token operator\">&#061;<\/span>RETRIEVE_TOP_K<span class=\"token punctuation\">,<\/span><br \/>\n            where<span class=\"token operator\">&#061;<\/span>where_clause<br \/>\n        <span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u683c\u5f0f\u5316\u7ed3\u679c<\/span><br \/>\n        formatted_results <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        <span class=\"token keyword\">for<\/span> i <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token builtin\">len<\/span><span class=\"token punctuation\">(<\/span>results<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;ids&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span 
class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            formatted_results<span class=\"token punctuation\">.<\/span>append<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">{<\/span><br \/>\n                <span class=\"token string\">&#034;id&#034;<\/span><span class=\"token punctuation\">:<\/span> results<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;ids&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span>i<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                <span class=\"token string\">&#034;score&#034;<\/span><span class=\"token punctuation\">:<\/span> results<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;distances&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span>i<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                <span class=\"token string\">&#034;modal_type&#034;<\/span><span class=\"token punctuation\">:<\/span> results<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;metadatas&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span>i<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;modal_type&#034;<\/span><span class=\"token 
punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                <span class=\"token string\">&#034;metadata&#034;<\/span><span class=\"token punctuation\">:<\/span> results<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;metadatas&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">[<\/span>i<span class=\"token punctuation\">]<\/span><br \/>\n            <span class=\"token punctuation\">}<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> formatted_results<\/p>\n<p><span class=\"token comment\"># \u6d4b\u8bd5\u8de8\u6a21\u6001\u68c0\u7d22<\/span><br \/>\n<span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>embedding <span class=\"token keyword\">import<\/span> MultimodalEmbedding<\/p>\n<p>    <span class=\"token comment\"># \u521d\u59cb\u5316\u5d4c\u5165\u5668\u548c\u68c0\u7d22\u5668<\/span><br \/>\n    embedder <span class=\"token operator\">&#061;<\/span> MultimodalEmbedding<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    retriever <span class=\"token operator\">&#061;<\/span> MultimodalRetriever<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 1. 
\u6dfb\u52a0\u6d4b\u8bd5\u6570\u636e\u5230\u5411\u91cf\u5e93<\/span><br \/>\n    <span class=\"token comment\"># \u6587\u672c\u5d4c\u5165<\/span><br \/>\n    text_id <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;text_1&#034;<\/span><br \/>\n    text <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;\u7ea2\u8272\u8dd1\u8f66\u5728\u516c\u8def\u4e0a\u884c\u9a76&#034;<\/span><br \/>\n    text_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">)<\/span><br \/>\n    retriever<span class=\"token punctuation\">.<\/span>add_embedding<span class=\"token punctuation\">(<\/span><br \/>\n        <span class=\"token builtin\">id<\/span><span class=\"token operator\">&#061;<\/span>text_id<span class=\"token punctuation\">,<\/span><br \/>\n        embedding<span class=\"token operator\">&#061;<\/span>text_emb<span class=\"token punctuation\">,<\/span><br \/>\n        modal_type<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;text&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        metadata<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;content&#034;<\/span><span class=\"token punctuation\">:<\/span> text<span class=\"token punctuation\">}<\/span><br \/>\n    <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u56fe\u50cf\u5d4c\u5165<\/span><br \/>\n    image_id <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;image_1&#034;<\/span><br \/>\n    image_path <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;data\/images\/car.jpg&#034;<\/span><br \/>\n    image_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token 
punctuation\">.<\/span>get_image_embedding<span class=\"token punctuation\">(<\/span>image_path<span class=\"token punctuation\">)<\/span><br \/>\n    retriever<span class=\"token punctuation\">.<\/span>add_embedding<span class=\"token punctuation\">(<\/span><br \/>\n        <span class=\"token builtin\">id<\/span><span class=\"token operator\">&#061;<\/span>image_id<span class=\"token punctuation\">,<\/span><br \/>\n        embedding<span class=\"token operator\">&#061;<\/span>image_emb<span class=\"token punctuation\">,<\/span><br \/>\n        modal_type<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;image&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        metadata<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;path&#034;<\/span><span class=\"token punctuation\">:<\/span> image_path<span class=\"token punctuation\">}<\/span><br \/>\n    <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u89c6\u9891\u5d4c\u5165<\/span><br \/>\n    video_id <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;video_1&#034;<\/span><br \/>\n    video_path <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;data\/videos\/car_driving.mp4&#034;<\/span><br \/>\n    video_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_video_embedding<span class=\"token punctuation\">(<\/span>video_path<span class=\"token punctuation\">)<\/span><br \/>\n    retriever<span class=\"token punctuation\">.<\/span>add_embedding<span class=\"token punctuation\">(<\/span><br \/>\n        <span class=\"token builtin\">id<\/span><span class=\"token operator\">&#061;<\/span>video_id<span class=\"token punctuation\">,<\/span><br \/>\n        embedding<span class=\"token operator\">&#061;<\/span>video_emb<span class=\"token 
punctuation\">,<\/span><br \/>\n        modal_type<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;video&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        metadata<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;path&#034;<\/span><span class=\"token punctuation\">:<\/span> video_path<span class=\"token punctuation\">}<\/span><br \/>\n    <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 2. \u8de8\u6a21\u6001\u68c0\u7d22&#xff08;\u6587\u672c\u67e5\u8be2\u2192\u627e\u76f8\u4f3c\u56fe\u50cf\/\u89c6\u9891&#xff09;<\/span><br \/>\n    query_text <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;\u7ea2\u8272\u8dd1\u8f66\u5728\u8def\u4e0a\u5f00&#034;<\/span><br \/>\n    query_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>query_text<span class=\"token punctuation\">)<\/span><br \/>\n    results <span class=\"token operator\">&#061;<\/span> retriever<span class=\"token punctuation\">.<\/span>retrieve<span class=\"token punctuation\">(<\/span>query_emb<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># 3. 
\u6253\u5370\u68c0\u7d22\u7ed3\u679c<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u67e5\u8be2\u6587\u672c&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>query_text<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8de8\u6a21\u6001\u68c0\u7d22\u7ed3\u679c&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">for<\/span> res <span class=\"token keyword\">in<\/span> results<span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;- ID: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>res<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;id&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">, \u7c7b\u578b: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>res<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;modal_type&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">, \u76f8\u4f3c\u5ea6: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span><span class=\"token number\">1<\/span><span class=\"token operator\">-<\/span>res<span class=\"token punctuation\">[<\/span><span class=\"token 
string\">&#039;score&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>4.7 \u4e3b\u7a0b\u5e8f&#xff08;main.py&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">import<\/span> logging<br \/>\n<span class=\"token keyword\">from<\/span> loguru <span class=\"token keyword\">import<\/span> logger<br \/>\n<span class=\"token keyword\">from<\/span> config <span class=\"token keyword\">import<\/span> LOGS_DIR<br \/>\n<span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>embedding <span class=\"token keyword\">import<\/span> MultimodalEmbedding<br \/>\n<span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>retriever <span class=\"token keyword\">import<\/span> MultimodalRetriever<\/p>\n<p><span class=\"token comment\"># \u914d\u7f6e\u65e5\u5fd7<\/span><br \/>\nlogger<span class=\"token punctuation\">.<\/span>add<span class=\"token punctuation\">(<\/span><br \/>\n    LOGS_DIR <span class=\"token operator\">\/<\/span> <span class=\"token string\">&#034;multimodal.log&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    rotation<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;100MB&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    retention<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;7 days&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    encoding<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;utf-8&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    level<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;INFO&#034;<\/span><br \/>\n<span 
class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">main<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    logger<span class=\"token punctuation\">.<\/span>info<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u542f\u52a8Gemini\u591a\u6a21\u6001\u7edf\u4e00\u8868\u793a\u7cfb\u7edf&#034;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u521d\u59cb\u5316\u7ec4\u4ef6<\/span><br \/>\n    embedder <span class=\"token operator\">&#061;<\/span> MultimodalEmbedding<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    retriever <span class=\"token operator\">&#061;<\/span> MultimodalRetriever<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u4ea4\u4e92\u83dc\u5355<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;&#061;&#061;&#061; Gemini\u591a\u6a21\u6001\u7edf\u4e00\u8868\u793a\u7cfb\u7edf &#061;&#061;&#061;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;1. \u751f\u6210\u6587\u672c\u5d4c\u5165&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;2. \u751f\u6210\u56fe\u50cf\u5d4c\u5165&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;3. 
\u751f\u6210\u89c6\u9891\u5d4c\u5165&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;4. \u8de8\u6a21\u6001\u68c0\u7d22&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;5. \u9000\u51fa&#034;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">while<\/span> <span class=\"token boolean\">True<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        choice <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\\n\u8bf7\u9009\u62e9\u529f\u80fd&#xff08;1-5&#xff09;&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">try<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token keyword\">if<\/span> choice <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;1&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                text <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8bf7\u8f93\u5165\u6587\u672c&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">)<\/span><br \/>\n                logger<span class=\"token punctuation\">.<\/span>info<span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span 
class=\"token string\">f&#034;\u751f\u6210\u6587\u672c\u5d4c\u5165&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>text<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">20]<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#8230; \u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6587\u672c\u5d4c\u5165\u751f\u6210\u6210\u529f&#xff0c;\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>            <span class=\"token keyword\">elif<\/span> choice <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;2&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                image_path <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8bf7\u8f93\u5165\u56fe\u50cf\u8def\u5f84&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_image_embedding<span class=\"token punctuation\">(<\/span>image_path<span class=\"token 
punctuation\">)<\/span><br \/>\n                logger<span class=\"token punctuation\">.<\/span>info<span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u751f\u6210\u56fe\u50cf\u5d4c\u5165&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>image_path<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\"> \u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u56fe\u50cf\u5d4c\u5165\u751f\u6210\u6210\u529f&#xff0c;\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>            <span class=\"token keyword\">elif<\/span> choice <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;3&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                video_path <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8bf7\u8f93\u5165\u89c6\u9891\u8def\u5f84&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_video_embedding<span class=\"token 
punctuation\">(<\/span>video_path<span class=\"token punctuation\">)<\/span><br \/>\n                logger<span class=\"token punctuation\">.<\/span>info<span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u751f\u6210\u89c6\u9891\u5d4c\u5165&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>video_path<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\"> \u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u89c6\u9891\u5d4c\u5165\u751f\u6210\u6210\u529f&#xff0c;\u7ef4\u5ea6&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>            <span class=\"token keyword\">elif<\/span> choice <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;4&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                query_type <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8bf7\u9009\u62e9\u67e5\u8be2\u7c7b\u578b&#xff08;text\/image\/video&#xff09;&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">if<\/span> query_type <span 
class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;text&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                    query <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8bf7\u8f93\u5165\u67e5\u8be2\u6587\u672c&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                    query_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>query<span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">elif<\/span> query_type <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;image&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                    query <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u8bf7\u8f93\u5165\u67e5\u8be2\u56fe\u50cf\u8def\u5f84&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                    query_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_image_embedding<span class=\"token punctuation\">(<\/span>query<span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">elif<\/span> query_type <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;video&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                    query <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">input<\/span><span class=\"token punctuation\">(<\/span><span class=\"token 
string\">&#034;\u8bf7\u8f93\u5165\u67e5\u8be2\u89c6\u9891\u8def\u5f84&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                    query_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_video_embedding<span class=\"token punctuation\">(<\/span>query<span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">else<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u65e0\u6548\u7684\u67e5\u8be2\u7c7b\u578b&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                    <span class=\"token keyword\">continue<\/span><\/p>\n<p>                <span class=\"token comment\"># \u68c0\u7d22<\/span><br \/>\n                results <span class=\"token operator\">&#061;<\/span> retriever<span class=\"token punctuation\">.<\/span>retrieve<span class=\"token punctuation\">(<\/span>query_emb<span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\\n\u68c0\u7d22\u7ed3\u679c&#xff08;\u6309\u76f8\u4f3c\u5ea6\u6392\u5e8f&#xff09;&#xff1a;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">for<\/span> i<span class=\"token punctuation\">,<\/span> res <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">enumerate<\/span><span class=\"token punctuation\">(<\/span>results<span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                    similarity <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">1<\/span> <span class=\"token 
operator\">-<\/span> res<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;score&#034;<\/span><span class=\"token punctuation\">]<\/span>  <span class=\"token comment\"># Chroma\u8fd4\u56de\u7684\u662f\u8ddd\u79bb&#xff0c;\u8f6c\u6362\u4e3a\u76f8\u4f3c\u5ea6<\/span><br \/>\n                    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>i<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">. ID: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>res<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;id&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">, \u7c7b\u578b: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>res<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;modal_type&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">, \u76f8\u4f3c\u5ea6: <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>similarity<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>            <span class=\"token keyword\">elif<\/span> choice <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;5&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                logger<span class=\"token punctuation\">.<\/span>info<span class=\"token punctuation\">(<\/span><span 
class=\"token string\">&#034;\u9000\u51fa\u7cfb\u7edf&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u518d\u89c1&#xff01;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n                <span class=\"token keyword\">break<\/span><\/p>\n<p>            <span class=\"token keyword\">else<\/span><span class=\"token punctuation\">:<\/span><br \/>\n                <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u65e0\u6548\u7684\u9009\u62e9&#xff0c;\u8bf7\u8f93\u51651-5&#034;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token keyword\">except<\/span> Exception <span class=\"token keyword\">as<\/span> e<span class=\"token punctuation\">:<\/span><br \/>\n            logger<span class=\"token punctuation\">.<\/span>error<span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u64cd\u4f5c\u5931\u8d25&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span><span class=\"token builtin\">str<\/span><span class=\"token punctuation\">(<\/span>e<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">,<\/span> exc_info<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u9519\u8bef&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span><span class=\"token builtin\">str<\/span><span 
class=\"token punctuation\">(<\/span>e<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    main<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>4.8 \u8fd9\u4e9b\u6587\u4ef6\u662f\u5982\u4f55\u534f\u4f5c\u7684&#xff1f;<\/h4>\n<p>Gemini \u591a\u6a21\u6001\u7cfb\u7edf\u7684\u8fd0\u884c\u903b\u8f91\u662f&#xff1a;\u201c\u8f93\u5165\u591a\u6a21\u6001&#xff0c;\u8f6c\u5316\u7686\u5e8f\u5217&#xff0c;\u8f93\u51fa\u7686\u5411\u91cf\u201d\u3002<\/p>\n<p>\u2502<br \/>\n\u251c\u2500\u2500 \u3010\u7528\u6237\u8f93\u5165\u3011<br \/>\n\u2502   \u251c\u2500\u2500 \u5f85\u5904\u7406\u6570\u636e: \u6587\u672c <span class=\"token string\">&#034;\u5915\u9633\u4e0b\u7684\u6d77\u6ee9&#034;<\/span> \/ \u56fe\u7247 <span class=\"token punctuation\">[<\/span>img.jpg<span class=\"token punctuation\">]<\/span> \/ \u89c6\u9891 <span class=\"token punctuation\">[<\/span>vid.mp4<span class=\"token punctuation\">]<\/span><br \/>\n\u2502   \u2514\u2500\u2500 \u76ee\u7684: \u751f\u6210\u5411\u91cf <span class=\"token punctuation\">(<\/span>Embedding<span class=\"token punctuation\">)<\/span> \u6216 \u6267\u884c\u68c0\u7d22 <span class=\"token punctuation\">(<\/span>Retrieval<span class=\"token punctuation\">)<\/span><br \/>\n\u2502<br \/>\n\u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span>. 
\u611f\u77e5\u4e0e\u5e8f\u5217\u5316\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Perception <span class=\"token operator\">&amp;<\/span> Serialization<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                            \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528\u6a21\u5757<span class=\"token operator\">&gt;<\/span>: core\/modal_processor.py <span class=\"token punctuation\">(<\/span>\u5e8f\u5217\u5316\u5de5\u5382<span class=\"token punctuation\">)<\/span>         \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u4f9d\u8d56\u914d\u7f6e: config.py <span class=\"token punctuation\">(<\/span>\u5b9a\u4e49 PATCH_SIZE, MAX_SEQ_LEN<span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2502   \u2502                                                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 A. \u6587\u672c\u6d41 <span class=\"token punctuation\">(<\/span>Text Stream<span class=\"token punctuation\">)<\/span>                              \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5de5\u5177<span class=\"token operator\">&gt;<\/span>: Tokenizer <span class=\"token punctuation\">(<\/span>\u6765\u81ea model_setup.py<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa: 1D Token IDs <span class=\"token punctuation\">[<\/span><span class=\"token number\">101<\/span>, <span class=\"token number\">2345<\/span>, <span class=\"token number\">889<\/span><span class=\"token punctuation\">..<\/span>.<span class=\"token punctuation\">]<\/span>         \u2502<br \/>\n\u2502   \u2502                                                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 B. 
\u89c6\u89c9\u6d41 <span class=\"token punctuation\">(<\/span>Visual Stream &#8211; \u56fe\u50cf\/\u89c6\u9891<span class=\"token punctuation\">)<\/span>                \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5de5\u5177<span class=\"token operator\">&gt;<\/span>: ImageProcessor <span class=\"token punctuation\">(<\/span>\u6765\u81ea model_setup.py<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a8\u4f5c<span class=\"token operator\">&gt;<\/span>: Resize &#8211;<span class=\"token operator\">&gt;<\/span> Crop &#8211;<span class=\"token operator\">&gt;<\/span> Patch\u5316                \u2502<br \/>\n\u2502   \u2502   \u2502   \u251c\u2500\u2500 \u56fe\u50cf: 16&#215;16 Patch \u5c55\u5f00                       \u2502<br \/>\n\u2502   \u2502   \u2502   \u2514\u2500\u2500 \u89c6\u9891: \u65f6\u7a7a\u91c7\u6837 <span class=\"token punctuation\">(<\/span>Time-Space Sampling<span class=\"token punctuation\">)<\/span>         \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa: Visual Tokens <span class=\"token punctuation\">(<\/span>Pixel Values &#043; Time IDs<span class=\"token punctuation\">)<\/span>  \u2502<br \/>\n\u2502                                                            \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u5408\u5e76\u72b6\u6001: \u7edf\u4e00\u7684 PyTorch Tensor <span class=\"token punctuation\">(<\/span>\u9002\u914d Transformer \u8f93\u5165<span class=\"token punctuation\">)<\/span> \u2518<br \/>\n          \u2502<br \/>\n          \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">2<\/span>. 
\u7edf\u4e00\u7f16\u7801\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Unified Encoding<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&lt;<\/span>\u2605 \u6838\u5fc3\/The Brain<span class=\"token operator\">&gt;<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                            \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528\u6a21\u5757<span class=\"token operator\">&gt;<\/span>: core\/embedding.py <span class=\"token punctuation\">(<\/span>\u5411\u91cf\u751f\u6210\u5668<span class=\"token punctuation\">)<\/span>               \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u4f9d\u8d56\u6a21\u578b: core\/model_setup.py <span class=\"token punctuation\">(<\/span>\u52a0\u8f7d\u5171\u4eab Transformer<span class=\"token punctuation\">)<\/span> \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 \u6743\u91cd: google\/gemma-2-9b-it                       \u2502<br \/>\n\u2502   \u2502                                                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u2699 \u524d\u5411\u4f20\u64ad <span class=\"token punctuation\">(<\/span>Forward Pass<span class=\"token punctuation\">)<\/span>                            \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 \u8f93\u5165: \u5f02\u6784\u7684 Token \u5e8f\u5217                          \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 \u673a\u5236: \u5171\u4eab\u81ea\u6ce8\u610f\u529b <span class=\"token punctuation\">(<\/span>Self-Attention<span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u2502   \u2502   <span class=\"token punctuation\">(<\/span>\u6587\u672cToken\u4e0e\u89c6\u89c9Token\u5728\u540c\u4e00\u5c42\u8fdb\u884c\u4ea4\u4e92<span class=\"token punctuation\">)<\/span>           \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> 
\u539f\u59cb\u8f93\u51fa: <span class=\"token punctuation\">[<\/span>Batch, Seq_Len, Hidden_Dim<span class=\"token punctuation\">]<\/span>         \u2502<br \/>\n\u2502   \u2502                                                        \u2502<br \/>\n\u2502   \u2514\u2500\u2500 &#x1f4cf; \u7ef4\u5ea6\u5bf9\u9f50 <span class=\"token punctuation\">(<\/span>Projection <span class=\"token operator\">&amp;<\/span> Normalization<span class=\"token punctuation\">)<\/span>             \u2502<br \/>\n\u2502       \u251c\u2500\u2500 \u6295\u5f71: Linear\u5c42\u5c06\u7ef4\u5ea6\u5f3a\u5236\u8f6c\u4e3a config.D_MODEL      \u2502<br \/>\n\u2502       \u251c\u2500\u2500 \u5f52\u4e00\u5316: L2 Normalize <span class=\"token punctuation\">(<\/span>\u786e\u4fdd\u6a21\u6001\u95f4\u8ddd\u79bb\u53ef\u6bd4<span class=\"token punctuation\">)<\/span>        \u2502<br \/>\n\u2502       \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u6700\u7ec8\u4ea7\u7269: \u7edf\u4e00\u5411\u91cf <span class=\"token punctuation\">(<\/span>Unified Vector, <span class=\"token number\">2048<\/span>\u7ef4<span class=\"token punctuation\">)<\/span>    \u2502<br \/>\n\u2502                                                            \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa\u7ed3\u679c: Numpy Array <span class=\"token punctuation\">[<\/span><span class=\"token number\">0.12<\/span>, -0.56, <span class=\"token number\">0.99<\/span>, <span class=\"token punctuation\">..<\/span>.<span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n          \u2502<br \/>\n          \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">3<\/span>. 
\u5b58\u50a8\u4e0e\u68c0\u7d22\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Storage <span class=\"token operator\">&amp;<\/span> Retrieval<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                            \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528\u6a21\u5757<span class=\"token operator\">&gt;<\/span>: core\/retriever.py <span class=\"token punctuation\">(<\/span>\u6df7\u5408\u8bb0\u5fc6\u7ba1\u5bb6<span class=\"token punctuation\">)<\/span>             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u4f9d\u8d56\u5b58\u50a8: embeddings_db\/chroma <span class=\"token punctuation\">(<\/span>\u5411\u91cf\u6570\u636e\u5e93<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502   \u2502                                                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 A. \u5199\u5165\u6a21\u5f0f <span class=\"token punctuation\">(<\/span>Indexing<span class=\"token punctuation\">)<\/span>                               \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 \u52a8\u4f5c: \u5c06\u5411\u91cf &#043; \u5143\u6570\u636e <span class=\"token punctuation\">(<\/span>Metadata<span class=\"token punctuation\">)<\/span> \u5b58\u5165\u96c6\u5408        \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 \u7ed3\u679c: \u6301\u4e45\u5316\u5230\u78c1\u76d8 <span class=\"token punctuation\">(<\/span>Chroma Persist<span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u2502                                                        \u2502<br \/>\n\u2502   \u2514\u2500\u2500 B. 
\u67e5\u8be2\u6a21\u5f0f <span class=\"token punctuation\">(<\/span>Searching<span class=\"token punctuation\">)<\/span>                              \u2502<br \/>\n\u2502       \u251c\u2500\u2500 \u8f93\u5165: \u67e5\u8be2\u5411\u91cf <span class=\"token punctuation\">(<\/span>Query Vector<span class=\"token punctuation\">)<\/span>                    \u2502<br \/>\n\u2502       \u251c\u2500\u2500 \u8ba1\u7b97: \u4f59\u5f26\u76f8\u4f3c\u5ea6 <span class=\"token punctuation\">(<\/span>Cosine Similarity<span class=\"token punctuation\">)<\/span>             \u2502<br \/>\n\u2502       \u2502   <span class=\"token punctuation\">(<\/span>\u5728\u540c\u4e00\u7a7a\u95f4\u5185\u8ba1\u7b97 \u6587\u672c-\u56fe \/ \u56fe-\u89c6\u9891 \u7684\u8ddd\u79bb<span class=\"token punctuation\">)<\/span>      \u2502<br \/>\n\u2502       \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u6700\u7ec8\u7ed3\u679c: Top-K \u5339\u914d\u9879 <span class=\"token punctuation\">(<\/span>\u5982: \u7528\u6587\u5b57\u641c\u5230\u7684\u89c6\u9891<span class=\"token punctuation\">)<\/span>  \u2502<br \/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<h5>4.2.2 \u534f\u4f5c\u7ec6\u8282\u6df1\u5ea6\u89e3\u6790<\/h5>\n<p>\u4ee5\u4e0b\u662f\u8fd9\u4e9b\u4ee3\u7801\u6587\u4ef6\u5728\u4e00\u6b21\u201c\u8de8\u6a21\u6001\u68c0\u7d22\u201d\u4efb\u52a1&#xff08;\u4f8b\u5982&#xff1a;\u7528\u6587\u5b57\u641c\u89c6\u9891&#xff09;\u4e2d\u7684\u5177\u4f53\u63e1\u624b\u8fc7\u7a0b&#xff1a;<\/p>\n<p>1. 
\u9884\u5904\u7406\u534f\u4f5c (Processor &amp; Config)<\/p>\n<ul>\n<li>\u8f93\u5165&#xff1a;\u7528\u6237\u8f93\u5165\u6587\u5b57 query &#061; \u201c\u7ea2\u8272\u8dd1\u8f66\u5728\u516c\u8def\u4e0a\u201d\u3002<\/li>\n<li>\u534f\u4f5c\u903b\u8f91&#xff1a;\n<ul>\n<li>main.py \u63a5\u6536\u6307\u4ee4&#xff0c;\u8c03\u7528 core\/modal_processor.py\u3002<\/li>\n<li>modal_processor.py \u8bfb\u53d6 config.py \u4e2d\u7684 MAX_SEQ_LEN&#061;4096&#xff0c;\u786e\u4fdd\u751f\u6210\u7684\u5e8f\u5217\u4e0d\u4f1a\u7206\u663e\u5b58\u3002<\/li>\n<li>\u5b83\u5411 core\/model_setup.py \u501f\u7528\u521d\u59cb\u5316\u597d\u7684 Tokenizer&#xff0c;\u5c06\u6587\u5b57\u5207\u5206\u4e3a\u6570\u5b57 ID\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u4ea7\u7269&#xff1a;\u4e00\u4e2a\u6807\u51c6\u7684 PyTorch Tensor&#xff0c;\u51c6\u5907\u9001\u5165\u663e\u5361\u3002<\/li>\n<\/ul>\n<p>2. \u6838\u5fc3\u7f16\u7801\u534f\u4f5c (Embedding &amp; Model) \u8fd9\u662f Gemini \u67b6\u6784\u7684\u7075\u9b42&#xff0c;\u4e5f\u662f<strong>\u201c\u591a\u6a21\u6001\u7edf\u4e00\u201d<\/strong>\u53d1\u751f\u7684\u7269\u7406\u573a\u6240\u3002<\/p>\n<ul>\n<li>\u534f\u4f5c\u903b\u8f91&#xff1a;\n<ul>\n<li>core\/embedding.py \u83b7\u53d6\u5904\u7406\u597d\u7684 Tensor&#xff0c;\u5c06\u5176\u5582\u7ed9 core\/model_setup.py \u52a0\u8f7d\u7684 \u5171\u4eab Transformer \u6a21\u578b\u3002<\/li>\n<li>\u5173\u952e\u70b9&#xff1a;\u6b64\u65f6\u6a21\u578b\u5e76\u4e0d\u533a\u5206\u8fd9\u662f\u201c\u6587\u5b57\u201d\u8fd8\u662f\u201c\u89c6\u9891\u201d\u3002\u5728\u6a21\u578b\u773c\u4e2d&#xff0c;\u5b83\u4eec\u90fd\u662f\u9700\u8981\u8fdb\u884c\u6ce8\u610f\u529b\u8ba1\u7b97\u7684 Token\u3002<\/li>\n<li>\u5982\u679c\u8f93\u5165\u662f\u89c6\u9891&#xff0c;embedding.py \u4f1a\u7279\u522b\u5904\u7406\u8f93\u51fa\u7684\u7ef4\u5ea6&#xff08;\u5bf9\u65f6\u95f4\u8f74\u548c\u7a7a\u95f4\u8f74\u53d6\u5e73\u5747\u6c60\u5316&#xff09;&#xff0c;\u5e76\u6839\u636e config.py \u4e2d\u7684 D_MODEL \u5f3a\u884c\u5c06\u5411\u91cf\u957f\u5ea6\u538b\u5236\u4e3a 2048 
\u7ef4\u3002<\/li>\n<li>\u6700\u540e&#xff0c;\u6267\u884c L2 \u5f52\u4e00\u5316\u3002\u8fd9\u4e00\u6b65\u81f3\u5173\u91cd\u8981&#xff0c;\u5b83\u628a\u6240\u6709\u5411\u91cf\u62c9\u4f38\u5230\u5355\u4f4d\u957f\u5ea6&#xff0c;\u4f7f\u5f97\u8ba1\u7b97\u76f8\u4f3c\u5ea6&#xff08;\u5939\u89d2\u4f59\u5f26&#xff09;\u6210\u4e3a\u53ef\u80fd\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>3. \u8bb0\u5fc6\u68c0\u7d22\u534f\u4f5c (Retriever &amp; DB)<\/p>\n<ul>\n<li>\u534f\u4f5c\u903b\u8f91&#xff1a;\n<ul>\n<li>core\/retriever.py \u5524\u9192 embeddings_db\/chroma \u76ee\u5f55\u4e0b\u7684\u5411\u91cf\u6570\u636e\u5e93\u3002<\/li>\n<li>\u5b83\u63a5\u6536 embedding.py \u5410\u51fa\u7684\u90a3\u4e2a 2048 \u7ef4\u5411\u91cf\u3002<\/li>\n<li>\u5b83\u5728\u6570\u636e\u5e93\u4e2d\u5feb\u901f\u626b\u63cf\u51e0\u767e\u4e07\u4e2a\u5df2\u5b58\u50a8\u7684\u5411\u91cf&#xff08;\u53ef\u80fd\u662f\u56fe\u7247\u751f\u6210\u7684&#xff0c;\u4e5f\u53ef\u80fd\u662f\u89c6\u9891\u751f\u6210\u7684&#xff09;\u3002<\/li>\n<li>\u9b54\u6cd5\u65f6\u523b&#xff1a;\u56e0\u4e3a\u5b83\u5728\u6570\u5b66\u4e0a\u627e\u5230\u4e86\u8ddd\u79bb\u6700\u8fd1\u7684\u5411\u91cf&#xff0c;\u800c\u8be5\u5411\u91cf\u7684\u5143\u6570\u636e\u6307\u5411\u4e00\u4e2a .mp4 \u6587\u4ef6&#xff0c;\u7cfb\u7edf\u6210\u529f\u5b9e\u73b0\u4e86\u201c\u7528\u6587\u5b57\u641c\u7d22\u89c6\u9891\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>\u603b\u7ed3<\/p>\n<ul>\n<li>config.py \u662f\u5baa\u6cd5&#xff1a;\u89c4\u5b9a\u4e86\u6240\u6709\u6a21\u6001\u5fc5\u987b\u9075\u5b88\u7684\u5c3a\u5bf8\u548c\u6807\u51c6\u3002<\/li>\n<li>model_setup.py \u662f\u5927\u8111&#xff1a;\u63d0\u4f9b\u4e86\u7406\u89e3\u6240\u6709\u6570\u636e\u7684\u5171\u4eab\u795e\u7ecf\u7f51\u7edc\u6743\u91cd\u3002<\/li>\n<li>modal_processor.py \u662f\u7ffb\u8bd1\u5b98&#xff1a;\u628a\u4eba\u7c7b\u80fd\u770b\u61c2\u7684\u56fe\/\u6587\/\u89c6&#xff0c;\u7ffb\u8bd1\u6210\u5927\u8111\u80fd\u61c2\u7684\u5e8f\u5217\u3002<\/li>\n<li>embedding.py 
\u662f\u538b\u7f29\u673a&#xff1a;\u628a\u5e9e\u5927\u7684\u5e8f\u5217\u538b\u7f29\u6210\u4e00\u4e2a\u7cbe\u7b80\u7684\u6570\u5b66\u5411\u91cf\u3002<\/li>\n<li>retriever.py \u662f\u56fe\u4e66\u7ba1\u7406\u5458&#xff1a;\u8d1f\u8d23\u628a\u8fd9\u4e9b\u5411\u91cf\u5206\u95e8\u522b\u7c7b\u5730\u5b58\u597d&#xff0c;\u5e76\u5728\u9700\u8981\u65f6\u5feb\u901f\u627e\u5230\u3002<\/li>\n<\/ul>\n<h3>\u4e94\u3001\u591a\u6a21\u6001\u8868\u793a\u7684\u9a8c\u8bc1\u4e0e\u4f18\u5316<\/h3>\n<h4>5.1 \u8868\u793a\u7edf\u4e00\u7684\u9a8c\u8bc1\u65b9\u6cd5<\/h4>\n<h5>5.1.1 \u5b9a\u6027\u9a8c\u8bc1&#xff1a;\u8de8\u6a21\u6001\u8bed\u4e49\u4e00\u81f4\u6027<\/h5>\n<p>\u901a\u8fc7\u4eba\u5de5\u6807\u6ce8\u7684 \u201c\u6587\u672c &#8211; \u56fe\u50cf &#8211; \u89c6\u9891\u201d \u4e09\u5143\u7ec4&#xff0c;\u9a8c\u8bc1\u540c\u4e00\u8bed\u4e49\u5185\u5bb9\u7684\u5411\u91cf\u76f8\u4f3c\u5ea6\u662f\u5426\u9ad8\u4e8e\u4e0d\u540c\u8bed\u4e49&#xff1a;<\/p>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">validate_semantic_consistency<\/span><span class=\"token punctuation\">(<\/span>embedder<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u9a8c\u8bc1\u8de8\u6a21\u6001\u8bed\u4e49\u4e00\u81f4\u6027&#034;&#034;&#034;<\/span><br \/>\n    <span class=\"token comment\"># \u6b63\u4f8b&#xff1a;\u540c\u4e00\u8bed\u4e49\u7684\u4e0d\u540c\u6a21\u6001<\/span><br \/>\n    positive_samples <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n        <span class=\"token string\">&#034;text&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;\u5c0f\u732b\u8ffd\u7740\u6bdb\u7ebf\u7403\u8dd1&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;image&#034;<\/span><span class=\"token punctuation\">:<\/span> <span 
class=\"token string\">&#034;data\/images\/cat_ball.jpg&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;video&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;data\/videos\/cat_ball.mp4&#034;<\/span><br \/>\n    <span class=\"token punctuation\">}<\/span><\/p>\n<p>    <span class=\"token comment\"># \u8d1f\u4f8b&#xff1a;\u4e0d\u540c\u8bed\u4e49\u7684\u6a21\u6001<\/span><br \/>\n    negative_samples <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n        <span class=\"token string\">&#034;text&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;\u98de\u673a\u5728\u5929\u7a7a\u4e2d\u98de\u884c&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;image&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;data\/images\/plane.jpg&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;video&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;data\/videos\/plane_fly.mp4&#034;<\/span><br \/>\n    <span class=\"token punctuation\">}<\/span><\/p>\n<p>    <span class=\"token comment\"># \u751f\u6210\u6b63\u4f8b\u5d4c\u5165<\/span><br \/>\n    pos_text_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>positive_samples<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;text&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    pos_image_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_image_embedding<span class=\"token 
punctuation\">(<\/span>positive_samples<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;image&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    pos_video_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_video_embedding<span class=\"token punctuation\">(<\/span>positive_samples<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;video&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u751f\u6210\u8d1f\u4f8b\u5d4c\u5165<\/span><br \/>\n    neg_text_emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>negative_samples<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;text&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u8ba1\u7b97\u76f8\u4f3c\u5ea6<\/span><br \/>\n    pos_text_image <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>dot<span class=\"token punctuation\">(<\/span>pos_text_emb<span class=\"token punctuation\">,<\/span> pos_image_emb<span class=\"token punctuation\">)<\/span><br \/>\n    pos_text_video <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>dot<span class=\"token punctuation\">(<\/span>pos_text_emb<span class=\"token punctuation\">,<\/span> pos_video_emb<span class=\"token punctuation\">)<\/span><br \/>\n    neg_text_image <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>dot<span class=\"token punctuation\">(<\/span>neg_text_emb<span class=\"token punctuation\">,<\/span> pos_image_emb<span class=\"token 
punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;&#061;&#061;&#061; \u8bed\u4e49\u4e00\u81f4\u6027\u9a8c\u8bc1 &#061;&#061;&#061;&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6b63\u4f8b&#xff1a;\u6587\u672c-\u56fe\u50cf\u76f8\u4f3c\u5ea6 &#061; <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>pos_text_image<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u6b63\u4f8b&#xff1a;\u6587\u672c-\u89c6\u9891\u76f8\u4f3c\u5ea6 &#061; <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>pos_text_video<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;\u8d1f\u4f8b&#xff1a;\u6587\u672c-\u56fe\u50cf\u76f8\u4f3c\u5ea6 &#061; <\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>neg_text_image<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token 
string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u9a8c\u8bc1\u9608\u503c&#xff08;\u7406\u60f3\u60c5\u51b5\u4e0b\u6b63\u4f8b&gt;0.7&#xff0c;\u8d1f\u4f8b&lt;0.3&#xff09;<\/span><br \/>\n    <span class=\"token keyword\">assert<\/span> pos_text_image <span class=\"token operator\">&gt;<\/span> <span class=\"token number\">0.7<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;\u6b63\u4f8b\u6587\u672c-\u56fe\u50cf\u76f8\u4f3c\u5ea6\u8fc7\u4f4e&#034;<\/span><br \/>\n    <span class=\"token keyword\">assert<\/span> pos_text_video <span class=\"token operator\">&gt;<\/span> <span class=\"token number\">0.7<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;\u6b63\u4f8b\u6587\u672c-\u89c6\u9891\u76f8\u4f3c\u5ea6\u8fc7\u4f4e&#034;<\/span><br \/>\n    <span class=\"token keyword\">assert<\/span> neg_text_image <span class=\"token operator\">&lt;<\/span> <span class=\"token number\">0.3<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;\u8d1f\u4f8b\u76f8\u4f3c\u5ea6\u8fc7\u9ad8&#034;<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\u9a8c\u8bc1\u901a\u8fc7&#xff01;&#034;<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h5>5.1.2 \u5b9a\u91cf\u9a8c\u8bc1&#xff1a;\u68c0\u7d22\u51c6\u786e\u7387<\/h5>\n<p>\u4f7f\u7528\u6807\u51c6\u591a\u6a21\u6001\u6570\u636e\u96c6&#xff08;\u5982 MSCOCO\u3001Flickr30k&#xff09;\u8ba1\u7b97\u8de8\u6a21\u6001\u68c0\u7d22\u7684 Top-K \u51c6\u786e\u7387&#xff1a;<\/p>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">calculate_retrieval_accuracy<\/span><span class=\"token punctuation\">(<\/span>retriever<span class=\"token punctuation\">,<\/span> test_queries<span class=\"token punctuation\">,<\/span> test_ground_truth<span class=\"token 
punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;\u8ba1\u7b97\u8de8\u6a21\u6001\u68c0\u7d22\u51c6\u786e\u7387&#034;&#034;&#034;<\/span><br \/>\n    top1_correct <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0<\/span><br \/>\n    top5_correct <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0<\/span><br \/>\n    total <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">len<\/span><span class=\"token punctuation\">(<\/span>test_queries<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">for<\/span> query_id<span class=\"token punctuation\">,<\/span> query_emb <span class=\"token keyword\">in<\/span> test_queries<span class=\"token punctuation\">.<\/span>items<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># \u68c0\u7d22<\/span><br \/>\n        results <span class=\"token operator\">&#061;<\/span> retriever<span class=\"token punctuation\">.<\/span>retrieve<span class=\"token punctuation\">(<\/span>query_emb<span class=\"token punctuation\">)<\/span><br \/>\n        retrieved_ids <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span>res<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;id&#034;<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token keyword\">for<\/span> res <span class=\"token keyword\">in<\/span> results<span class=\"token punctuation\">]<\/span><\/p>\n<p>        <span class=\"token comment\"># \u68c0\u67e5Top-1<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> retrieved_ids<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token 
keyword\">in<\/span> test_ground_truth<span class=\"token punctuation\">[<\/span>query_id<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            top1_correct <span class=\"token operator\">&#043;&#061;<\/span> <span class=\"token number\">1<\/span><\/p>\n<p>        <span class=\"token comment\"># \u68c0\u67e5Top-5<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> <span class=\"token builtin\">any<\/span><span class=\"token punctuation\">(<\/span><span class=\"token builtin\">id<\/span> <span class=\"token keyword\">in<\/span> test_ground_truth<span class=\"token punctuation\">[<\/span>query_id<span class=\"token punctuation\">]<\/span> <span class=\"token keyword\">for<\/span> <span class=\"token builtin\">id<\/span> <span class=\"token keyword\">in<\/span> retrieved_ids<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">5<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            top5_correct <span class=\"token operator\">&#043;&#061;<\/span> <span class=\"token number\">1<\/span><\/p>\n<p>    <span class=\"token comment\"># \u8ba1\u7b97\u51c6\u786e\u7387<\/span><br \/>\n    top1_acc <span class=\"token operator\">&#061;<\/span> top1_correct <span class=\"token operator\">\/<\/span> total<br \/>\n    top5_acc <span class=\"token operator\">&#061;<\/span> top5_correct <span class=\"token operator\">\/<\/span> total<\/p>\n<p>    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;Top-1 \u51c6\u786e\u7387&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>top1_acc<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token 
punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">print<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string-interpolation\"><span class=\"token string\">f&#034;Top-5 \u51c6\u786e\u7387&#xff1a;<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span>top5_acc<span class=\"token punctuation\">:<\/span><span class=\"token format-spec\">.4f<\/span><span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> top1_acc<span class=\"token punctuation\">,<\/span> top5_acc<\/p>\n<h4>5.2 \u6027\u80fd\u4f18\u5316\u6280\u5de7<\/h4>\n<h5>5.2.1 \u6a21\u578b\u4f18\u5316<\/h5>\n<li>\n<p>\u91cf\u5316&#xff1a;\u4f7f\u7528 INT8\/INT4 \u91cf\u5316\u51cf\u5c11\u663e\u5b58\u5360\u7528&#xff08;\u901f\u5ea6\u63d0\u5347 2-4 \u500d&#xff09;<\/p>\n<p>\u8fd0\u884c<\/p>\n<p> <span class=\"token keyword\">from<\/span> transformers <span class=\"token keyword\">import<\/span> BitsAndBytesConfig<\/p>\n<p><span class=\"token comment\"># 4-bit\u91cf\u5316\u914d\u7f6e<\/span><br \/>\nbnb_config <span class=\"token operator\">&#061;<\/span> BitsAndBytesConfig<span class=\"token punctuation\">(<\/span><br \/>\n    load_in_4bit<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    bnb_4bit_use_double_quant<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    bnb_4bit_quant_type<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;nf4&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    bnb_4bit_compute_dtype<span class=\"token operator\">&#061;<\/span>torch<span class=\"token 
punctuation\">.<\/span>float16<br \/>\n<span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># \u52a0\u8f7d\u91cf\u5316\u6a21\u578b<\/span><br \/>\nmodel <span class=\"token operator\">&#061;<\/span> AutoModel<span class=\"token punctuation\">.<\/span>from_pretrained<span class=\"token punctuation\">(<\/span><br \/>\n    MODEL_NAME<span class=\"token punctuation\">,<\/span><br \/>\n    quantization_config<span class=\"token operator\">&#061;<\/span>bnb_config<span class=\"token punctuation\">,<\/span><br \/>\n    device_map<span class=\"token operator\">&#061;<\/span>DEVICE<br \/>\n<span class=\"token punctuation\">)<\/span>\n <\/li>\n<li>\n<p>\u6a21\u578b\u84b8\u998f&#xff1a;\u4f7f\u7528\u5927\u6a21\u578b\u84b8\u998f\u51fa\u5c0f\u6a21\u578b&#xff0c;\u4fdd\u6301\u8868\u793a\u80fd\u529b\u7684\u540c\u65f6\u63d0\u5347\u901f\u5ea6<\/p>\n<\/li>\n<h5>5.2.2 \u63a8\u7406\u4f18\u5316<\/h5>\n<li>\u6279\u5904\u7406&#xff1a;\u6279\u91cf\u5904\u7406\u591a\u6a21\u6001\u6570\u636e&#xff0c;\u63d0\u5347 GPU \u5229\u7528\u7387<\/li>\n<li>\u7f13\u5b58\u673a\u5236&#xff1a;\u7f13\u5b58\u9ad8\u9891\u8bbf\u95ee\u7684\u5d4c\u5165\u5411\u91cf&#xff0c;\u907f\u514d\u91cd\u590d\u8ba1\u7b97<\/li>\n<li>\u5f02\u6b65\u5904\u7406&#xff1a;\u89c6\u9891\u62bd\u5e27\u4e0e\u6a21\u578b\u63a8\u7406\u5f02\u6b65\u6267\u884c&#xff0c;\u51cf\u5c11\u7b49\u5f85\u65f6\u95f4<\/li>\n<h5>5.2.3 \u5b58\u50a8\u4f18\u5316<\/h5>\n<li>\n<p>\u5411\u91cf\u538b\u7f29&#xff1a;\u4f7f\u7528 PCA \/ \u91cf\u5316\u5c06 2048 \u7ef4\u5411\u91cf\u538b\u7f29\u81f3 512 \u7ef4&#xff08;\u7cbe\u5ea6\u635f\u5931 &lt; 5%&#xff09;<\/p>\n<p>\u8fd0\u884c<\/p>\n<p> <span class=\"token keyword\">from<\/span> sklearn<span class=\"token punctuation\">.<\/span>decomposition <span class=\"token keyword\">import<\/span> PCA<\/p>\n<p><span class=\"token comment\"># \u8bad\u7ec3PCA\u6a21\u578b&#xff08;\u4f7f\u7528\u6837\u672c\u5d4c\u5165&#xff09;<\/span><br \/>\npca <span class=\"token operator\">&#061;<\/span> PCA<span class=\"token 
punctuation\">(<\/span>n_components<span class=\"token operator\">&#061;<\/span><span class=\"token number\">512<\/span><span class=\"token punctuation\">)<\/span><br \/>\npca<span class=\"token punctuation\">.<\/span>fit<span class=\"token punctuation\">(<\/span>all_sample_embeddings<span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># all_sample_embeddings\u4e3a\u6837\u672c\u5d4c\u5165\u77e9\u9635<\/span><\/p>\n<p><span class=\"token comment\"># \u538b\u7f29\u5d4c\u5165<\/span><br \/>\ncompressed_emb <span class=\"token operator\">&#061;<\/span> pca<span class=\"token punctuation\">.<\/span>transform<span class=\"token punctuation\">(<\/span>original_emb<span class=\"token punctuation\">)<\/span>\n <\/li>\n<li>\n<p>\u5206\u5c42\u5b58\u50a8&#xff1a;\u9ad8\u9891\u8bbf\u95ee\u7684\u5d4c\u5165\u5b58\u5165\u5185\u5b58&#xff0c;\u4f4e\u9891\u5b58\u5165\u78c1\u76d8<\/p>\n<\/li>\n<h3>\u516d\u3001\u751f\u4ea7\u7ea7\u90e8\u7f72\u6700\u4f73\u5b9e\u8df5<\/h3>\n<h4>6.1 \u670d\u52a1\u5316\u5c01\u88c5&#xff08;FastAPI&#xff09;<\/h4>\n<p>\u8fd0\u884c<\/p>\n<p><span class=\"token keyword\">from<\/span> fastapi <span class=\"token keyword\">import<\/span> FastAPI<span class=\"token punctuation\">,<\/span> UploadFile<span class=\"token punctuation\">,<\/span> File<br \/>\n<span class=\"token keyword\">from<\/span> fastapi<span class=\"token punctuation\">.<\/span>middleware<span class=\"token punctuation\">.<\/span>cors <span class=\"token keyword\">import<\/span> CORSMiddleware<br \/>\n<span class=\"token keyword\">import<\/span> uvicorn<br \/>\n<span class=\"token keyword\">import<\/span> numpy <span class=\"token keyword\">as<\/span> np<br \/>\n<span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>embedding <span class=\"token keyword\">import<\/span> MultimodalEmbedding<br \/>\n<span class=\"token keyword\">from<\/span> core<span class=\"token punctuation\">.<\/span>retriever <span class=\"token keyword\">import<\/span> 
MultimodalRetriever<\/p>\n<p><span class=\"token comment\"># \u521d\u59cb\u5316FastAPI<\/span><br \/>\napp <span class=\"token operator\">&#061;<\/span> FastAPI<span class=\"token punctuation\">(<\/span>title<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;Gemini Multimodal API&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\napp<span class=\"token punctuation\">.<\/span>add_middleware<span class=\"token punctuation\">(<\/span><br \/>\n    CORSMiddleware<span class=\"token punctuation\">,<\/span><br \/>\n    allow_origins<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;*&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    allow_credentials<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    allow_methods<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;*&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    allow_headers<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;*&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n<span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># \u9884\u52a0\u8f7d\u7ec4\u4ef6<\/span><br \/>\nembedder <span class=\"token operator\">&#061;<\/span> MultimodalEmbedding<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\nretriever <span class=\"token operator\">&#061;<\/span> MultimodalRetriever<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># 
\u5065\u5eb7\u68c0\u67e5\u63a5\u53e3<\/span><br \/>\n<span class=\"token decorator annotation punctuation\">&#064;app<span class=\"token punctuation\">.<\/span>get<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\/health&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">async<\/span> <span class=\"token keyword\">def<\/span> <span class=\"token function\">health_check<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;status&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;healthy&#034;<\/span><span class=\"token punctuation\">}<\/span><\/p>\n<p><span class=\"token comment\"># \u6587\u672c\u5d4c\u5165\u63a5\u53e3<\/span><br \/>\n<span class=\"token decorator annotation punctuation\">&#064;app<span class=\"token punctuation\">.<\/span>post<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\/embed\/text&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">async<\/span> <span class=\"token keyword\">def<\/span> <span class=\"token function\">embed_text<\/span><span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_text_embedding<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n        <span class=\"token 
string\">&#034;embedding&#034;<\/span><span class=\"token punctuation\">:<\/span> emb<span class=\"token punctuation\">.<\/span>tolist<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;dim&#034;<\/span><span class=\"token punctuation\">:<\/span> emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;modal_type&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;text&#034;<\/span><br \/>\n    <span class=\"token punctuation\">}<\/span><\/p>\n<p><span class=\"token comment\"># \u56fe\u50cf\u5d4c\u5165\u63a5\u53e3&#xff08;\u652f\u6301\u6587\u4ef6\u4e0a\u4f20&#xff09;<\/span><br \/>\n<span class=\"token decorator annotation punctuation\">&#064;app<span class=\"token punctuation\">.<\/span>post<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\/embed\/image&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">async<\/span> <span class=\"token keyword\">def<\/span> <span class=\"token function\">embed_image<\/span><span class=\"token punctuation\">(<\/span><span class=\"token builtin\">file<\/span><span class=\"token punctuation\">:<\/span> UploadFile <span class=\"token operator\">&#061;<\/span> File<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">.<\/span><span class=\"token punctuation\">.<\/span><span class=\"token punctuation\">.<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token comment\"># \u4fdd\u5b58\u4e0a\u4f20\u6587\u4ef6<\/span><br \/>\n    file_path <span 
class=\"token operator\">&#061;<\/span> <span class=\"token string-interpolation\"><span class=\"token string\">f&#034;temp\/<\/span><span class=\"token interpolation\"><span class=\"token punctuation\">{<\/span><span class=\"token builtin\">file<\/span><span class=\"token punctuation\">.<\/span>filename<span class=\"token punctuation\">}<\/span><\/span><span class=\"token string\">&#034;<\/span><\/span><br \/>\n    <span class=\"token keyword\">with<\/span> <span class=\"token builtin\">open<\/span><span class=\"token punctuation\">(<\/span>file_path<span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;wb&#034;<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token keyword\">as<\/span> f<span class=\"token punctuation\">:<\/span><br \/>\n        f<span class=\"token punctuation\">.<\/span>write<span class=\"token punctuation\">(<\/span><span class=\"token keyword\">await<\/span> <span class=\"token builtin\">file<\/span><span class=\"token punctuation\">.<\/span>read<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token comment\"># \u751f\u6210\u5d4c\u5165<\/span><br \/>\n    emb <span class=\"token operator\">&#061;<\/span> embedder<span class=\"token punctuation\">.<\/span>get_image_embedding<span class=\"token punctuation\">(<\/span>file_path<span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n        <span class=\"token string\">&#034;embedding&#034;<\/span><span class=\"token punctuation\">:<\/span> emb<span class=\"token punctuation\">.<\/span>tolist<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;dim&#034;<\/span><span class=\"token punctuation\">:<\/span> emb<span class=\"token 
punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;modal_type&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token string\">&#034;image&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n        <span class=\"token string\">&#034;filename&#034;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">file<\/span><span class=\"token punctuation\">.<\/span>filename<br \/>\n    <span class=\"token punctuation\">}<\/span><\/p>\n<p><span class=\"token comment\"># \u8de8\u6a21\u6001\u68c0\u7d22\u63a5\u53e3<\/span><br \/>\n<span class=\"token decorator annotation punctuation\">&#064;app<span class=\"token punctuation\">.<\/span>post<\/span><span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;\/retrieve&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n<span class=\"token keyword\">async<\/span> <span class=\"token keyword\">def<\/span> <span class=\"token function\">retrieve<\/span><span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">list<\/span><span class=\"token punctuation\">[<\/span><span class=\"token builtin\">float<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span> modal_type<span class=\"token punctuation\">:<\/span> <span class=\"token builtin\">str<\/span> <span class=\"token operator\">&#061;<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    query_emb <span class=\"token operator\">&#061;<\/span> np<span class=\"token punctuation\">.<\/span>array<span class=\"token punctuation\">(<\/span>embedding<span class=\"token punctuation\">)<\/span><br \/>\n    
results <span class=\"token operator\">&#061;<\/span> retriever<span class=\"token punctuation\">.<\/span>retrieve<span class=\"token punctuation\">(<\/span>query_emb<span class=\"token punctuation\">,<\/span> filter_modal_type<span class=\"token operator\">&#061;<\/span>modal_type<span class=\"token punctuation\">)<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;results&#034;<\/span><span class=\"token punctuation\">:<\/span> results<span class=\"token punctuation\">}<\/span><\/p>\n<p><span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    uvicorn<span class=\"token punctuation\">.<\/span>run<span class=\"token punctuation\">(<\/span>app<span class=\"token punctuation\">,<\/span> host<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;0.0.0.0&#034;<\/span><span class=\"token punctuation\">,<\/span> port<span class=\"token operator\">&#061;<\/span><span class=\"token number\">8000<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h4>6.2 \u76d1\u63a7\u4e0e\u65e5\u5fd7<\/h4>\n<li>\u6027\u80fd\u76d1\u63a7&#xff1a;\u4f7f\u7528 Prometheus &#043; Grafana \u76d1\u63a7\u63a8\u7406\u5ef6\u8fdf\u3001\u663e\u5b58\u5360\u7528\u3001QPS<\/li>\n<li>\u65e5\u5fd7\u7ba1\u7406&#xff1a;\u7ed3\u6784\u5316\u65e5\u5fd7\u8bb0\u5f55\u6240\u6709\u8bf7\u6c42\u3001\u54cd\u5e94\u3001\u9519\u8bef\u4fe1\u606f<\/li>\n<li>\u6a21\u578b\u76d1\u63a7&#xff1a;\u5b9a\u671f\u9a8c\u8bc1\u5d4c\u5165\u8d28\u91cf&#xff0c;\u9632\u6b62\u6a21\u578b\u6f02\u79fb<\/li>\n<h4>6.3 
\u6269\u5c55\u6027\u8bbe\u8ba1<\/h4>\n<li>\u5206\u5e03\u5f0f\u90e8\u7f72&#xff1a;\u591a\u8282\u70b9\u8d1f\u8f7d\u5747\u8861&#xff0c;\u652f\u6301\u6c34\u5e73\u6269\u5c55<\/li>\n<li>\u6a21\u578b\u7248\u672c\u7ba1\u7406&#xff1a;\u652f\u6301\u591a\u7248\u672c\u6a21\u578b\u5e76\u884c\u90e8\u7f72&#xff0c;\u7070\u5ea6\u53d1\u5e03<\/li>\n<li>\u5bb9\u9519\u673a\u5236&#xff1a;\u6a21\u578b\u63a8\u7406\u5931\u8d25\u65f6\u81ea\u52a8\u964d\u7ea7\u5230\u5907\u7528\u6a21\u578b \/ API<\/li>\n<h3>\u4e03\u3001\u603b\u7ed3<\/h3>\n<p>Gemini \u901a\u8fc7\u539f\u751f\u591a\u6a21\u6001\u8bbe\u8ba1\u3001\u7edf\u4e00\u7684\u5e8f\u5217\u5316\u8f93\u5165\u3001\u5171\u4eab\u7684 Transformer \u7f16\u7801\u5668\u3001\u8de8\u6a21\u6001\u6ce8\u610f\u529b\u673a\u5236&#xff0c;\u771f\u6b63\u5b9e\u73b0\u4e86\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u7edf\u4e00\u8868\u793a\u3002\u5176\u6838\u5fc3\u4ef7\u503c\u5728\u4e8e&#xff1a;<\/p>\n<li>\u6253\u7834\u4e86\u6a21\u6001\u95f4\u7684\u8bed\u4e49\u58c1\u5792&#xff0c;\u4f7f\u8de8\u6a21\u6001\u7406\u89e3\u548c\u68c0\u7d22\u6210\u4e3a\u53ef\u80fd<\/li>\n<li>\u7edf\u4e00\u7684\u5411\u91cf\u7a7a\u95f4\u7b80\u5316\u4e86\u591a\u6a21\u6001\u5e94\u7528\u7684\u5f00\u53d1\u6d41\u7a0b<\/li>\n<li>\u7aef\u5230\u7aef\u7684\u67b6\u6784\u63d0\u5347\u4e86\u591a\u6a21\u6001\u5904\u7406\u7684\u6548\u7387\u548c\u7cbe\u5ea6<\/li>\n<p>\u901a\u8fc7\u672c\u6587\u7684\u5b9e\u6218\u4ee3\u7801&#xff0c;\u5f00\u53d1\u8005\u53ef\u4ee5\u5feb\u901f\u642d\u5efa\u57fa\u4e8e Gemini \u7684\u591a\u6a21\u6001\u8868\u793a\u7cfb\u7edf&#xff0c;\u5e76\u901a\u8fc7\u4f18\u5316\u548c\u90e8\u7f72\u6280\u5de7&#xff0c;\u5c06\u5176\u843d\u5730\u4e3a\u751f\u4ea7\u7ea7\u5e94\u7528\u3002\u672a\u6765&#xff0c;\u968f\u7740 Gemini \u6a21\u578b\u7684\u6301\u7eed\u8fed\u4ee3&#xff0c;\u591a\u6a21\u6001\u8868\u793a\u7684\u7edf\u4e00\u7a0b\u5ea6\u548c\u5e94\u7528\u573a\u666f\u8fd8\u5c06\u8fdb\u4e00\u6b65\u6269\u5c55\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Gemini 
\u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a&#xff1f;<br \/>\n\u6587\u6863\u6982\u8ff0<br \/>\n\u672c\u6587\u6838\u5fc3\u4ef7\u503c<br \/>\n\u6df1\u5ea6\u62c6\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u8bbe\u8ba1\u903b\u8f91&#xff0c;\u89e3\u91ca\u5176\u5982\u4f55\u5b9e\u73b0\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u7edf\u4e00\u8868\u793a\u638c\u63e1 Gemini \u591a\u6a21\u6001\u6a21\u578b\u7684\u672c\u5730\u90e8\u7f72\u4e0e\u73af\u5883\u914d\u7f6e\u65b9\u6cd5\u901a\u8fc7\u53ef\u8fd0\u884c\u7684\u4ee3\u7801\u5b9e\u6218&#xff0c;\u5b9e\u73b0\u6587\u672c \/ \u56fe\u50cf \/ \u89c6\u9891\u7684\u7edf\u4e00\u5411\u91cf\u8868\u793a\u4e0e\u8de8\u6a21\u6001\u68c0\u7d22\u7406\u89e3\u591a\u6a21\u6001\u8868\u793a\u7edf\u4e00\u7684\u8c03\u8bd5\u3001\u4f18\u5316\u65b9\u6cd5\u4e0e\u751f\u4ea7\u7ea7\u90e8\u7f72\u6280\u5de7<br \/>\n\u5b66\u4e60\u76ee\u6807<br \/>\n\u7406\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u7ec4\u4ef6\u4e0e\u8de8\u6a21\u6001\u5bf9\u9f50\u539f\u7406\u638c<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[50,51,214],"topic":[],"class_list":["post-69072","post","type-post","status-publish","format-standard","hentry","category-server","tag-50","tag-51","tag-214"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" 
href=\"https:\/\/www.wsisp.com\/helps\/69072.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a&#xff1f; \u6587\u6863\u6982\u8ff0 \u672c\u6587\u6838\u5fc3\u4ef7\u503c \u6df1\u5ea6\u62c6\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u8bbe\u8ba1\u903b\u8f91&#xff0c;\u89e3\u91ca\u5176\u5982\u4f55\u5b9e\u73b0\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u7edf\u4e00\u8868\u793a\u638c\u63e1 Gemini \u591a\u6a21\u6001\u6a21\u578b\u7684\u672c\u5730\u90e8\u7f72\u4e0e\u73af\u5883\u914d\u7f6e\u65b9\u6cd5\u901a\u8fc7\u53ef\u8fd0\u884c\u7684\u4ee3\u7801\u5b9e\u6218&#xff0c;\u5b9e\u73b0\u6587\u672c \/ \u56fe\u50cf \/ \u89c6\u9891\u7684\u7edf\u4e00\u5411\u91cf\u8868\u793a\u4e0e\u8de8\u6a21\u6001\u68c0\u7d22\u7406\u89e3\u591a\u6a21\u6001\u8868\u793a\u7edf\u4e00\u7684\u8c03\u8bd5\u3001\u4f18\u5316\u65b9\u6cd5\u4e0e\u751f\u4ea7\u7ea7\u90e8\u7f72\u6280\u5de7 \u5b66\u4e60\u76ee\u6807 \u7406\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u7ec4\u4ef6\u4e0e\u8de8\u6a21\u6001\u5bf9\u9f50\u539f\u7406\u638c\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/69072.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-01-30T19:57:37+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta 
name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"20 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/69072.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/69072.html\",\"name\":\"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-01-30T19:57:37+00:00\",\"dateModified\":\"2026-01-30T19:57:37+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/69072.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/69072.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/69072.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Gemini 
\u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/69072.html","og_locale":"zh_CN","og_type":"article","og_title":"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a&#xff1f; \u6587\u6863\u6982\u8ff0 \u672c\u6587\u6838\u5fc3\u4ef7\u503c \u6df1\u5ea6\u62c6\u89e3 Gemini \u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u8bbe\u8ba1\u903b\u8f91&#xff0c;\u89e3\u91ca\u5176\u5982\u4f55\u5b9e\u73b0\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u7edf\u4e00\u8868\u793a\u638c\u63e1 Gemini \u591a\u6a21\u6001\u6a21\u578b\u7684\u672c\u5730\u90e8\u7f72\u4e0e\u73af\u5883\u914d\u7f6e\u65b9\u6cd5\u901a\u8fc7\u53ef\u8fd0\u884c\u7684\u4ee3\u7801\u5b9e\u6218&#xff0c;\u5b9e\u73b0\u6587\u672c \/ \u56fe\u50cf \/ \u89c6\u9891\u7684\u7edf\u4e00\u5411\u91cf\u8868\u793a\u4e0e\u8de8\u6a21\u6001\u68c0\u7d22\u7406\u89e3\u591a\u6a21\u6001\u8868\u793a\u7edf\u4e00\u7684\u8c03\u8bd5\u3001\u4f18\u5316\u65b9\u6cd5\u4e0e\u751f\u4ea7\u7ea7\u90e8\u7f72\u6280\u5de7 \u5b66\u4e60\u76ee\u6807 \u7406\u89e3 Gemini 
\u591a\u6a21\u6001\u67b6\u6784\u7684\u6838\u5fc3\u7ec4\u4ef6\u4e0e\u8de8\u6a21\u6001\u5bf9\u9f50\u539f\u7406\u638c","og_url":"https:\/\/www.wsisp.com\/helps\/69072.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-01-30T19:57:37+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"20 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/69072.html","url":"https:\/\/www.wsisp.com\/helps\/69072.html","name":"Gemini \u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-01-30T19:57:37+00:00","dateModified":"2026-01-30T19:57:37+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/69072.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/69072.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/69072.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"Gemini 
\u7684\u591a\u6a21\u6001\u67b6\u6784\u8bbe\u8ba1\u5982\u4f55\u7edf\u4e00\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8868\u793a\uff1f"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/69072","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=69072"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/69072\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=69072"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com
\/helps\/wp-json\/wp\/v2\/categories?post=69072"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=69072"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=69072"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}