{"id":52962,"date":"2025-08-11T22:09:03","date_gmt":"2025-08-11T14:09:03","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/52962.html"},"modified":"2025-08-11T22:09:03","modified_gmt":"2025-08-11T14:09:03","slug":"%e6%9c%ba%e5%99%a8%e5%ad%a6%e4%b9%a0tf-idf%e7%ae%97%e6%b3%95%e8%af%a6%e8%a7%a3","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/52962.html","title":{"rendered":"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3"},"content":{"rendered":"<\/p>\n<h4>\u6587\u7ae0\u76ee\u5f55<\/h4>\n<ul>\n<li>\u4e00\u3001 \u4ec0\u4e48\u662fTF-IDF&#xff1f;<\/li>\n<li>\n<ul>\n<li>1.\u516c\u5f0f\u8be6\u89e3<\/li>\n<li>2.\u5b9e\u4f8b\u8be6\u89e3<\/li>\n<\/ul>\n<\/li>\n<li>\u4e8c\u3001 \u4e3a\u4ec0\u4e48\u4f7f\u7528TF-IDF&#xff1f;<\/li>\n<li>\u4e09\u3001 Python API \u5b9e\u8df5&#xff1a;&#096;scikit-learn&#096;<\/li>\n<li>\n<ul>\n<li>1.&#096;TfidfVectorizer&#096; \u4e3b\u8981\u53c2\u6570<\/li>\n<li>2.\u6838\u5fc3\u65b9\u6cd5<\/li>\n<\/ul>\n<\/li>\n<li>\u56db\u3001 \u5b9e\u6218\u4f8b\u5b50&#xff1a;\u4ece\u6587\u4ef6\u8bfb\u53d6\u5e76\u5206\u6790\u5173\u952e\u8bcd<\/li>\n<li>\n<ul>\n<li>1.\u6587\u4ef6\u6570\u636e<\/li>\n<li>2..\u5b8c\u6574\u4ee3\u7801<\/li>\n<li>3.\u4ee3\u7801\u8be6\u89e3<\/li>\n<li>4.\u8f93\u51fa\u793a\u4f8b<\/li>\n<li>5.\u6539\u8fdb\u5efa\u8bae<\/li>\n<\/ul>\n<\/li>\n<li>\u4e94\u3001 \u603b\u7ed3\u4e0e\u5c55\u671b<\/li>\n<\/ul>\n<p> \u5728\u81ea\u7136\u8bed\u8a00\u5904\u7406&#xff08;NLP&#xff09;\u548c\u4fe1\u606f\u68c0\u7d22\u9886\u57df&#xff0c;\u5982\u4f55\u5c06\u4eba\u7c7b\u8bed\u8a00&#xff08;\u6587\u672c&#xff09;\u8f6c\u5316\u4e3a\u8ba1\u7b97\u673a\u53ef\u4ee5\u7406\u89e3\u548c\u5904\u7406\u7684\u6570\u503c\u5f62\u5f0f&#xff0c;\u662f\u81f3\u5173\u91cd\u8981\u7684\u7b2c\u4e00\u6b65\u3002<br \/>\nTF-IDF&#xff08;Term Frequency-Inverse Document 
Frequency&#xff09;\u5c31\u662f\u8fd9\u6837\u4e00\u4e2a\u7ecf\u5178\u4e14\u5f3a\u5927\u7684\u6280\u672f&#xff0c;\u5b83\u80fd\u591f\u8861\u91cf\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u96c6\u5408&#xff08;\u8bed\u6599\u5e93&#xff09;\u4e2d\u67d0\u4e2a\u6587\u6863\u91cc\u7684<br \/>\n\u91cd\u8981\u7a0b\u5ea6\u3002\u672c\u6587\u5c06\u5e26\u4f60\u6df1\u5165\u7406\u89e3TF-IDF\u7684\u539f\u7406&#xff0c;\u4ecb\u7ecd\u5e38\u7528\u7684Python API&#xff0c;\u5e76\u901a\u8fc7\u4e00\u4e2a\u57fa\u4e8e\u771f\u5b9e\u6587\u4ef6\u8bfb\u53d6\u7684\u5b8c\u6574\u5b9e\u6218\u4f8b\u5b50\u8fdb\u884c\u6f14\u793a\u3002<\/p>\n<hr \/>\n<h2>\u4e00\u3001 \u4ec0\u4e48\u662fTF-IDF&#xff1f;<\/h2>\n<p>TF-IDF\u662f\u4e00\u79cd\u7edf\u8ba1\u65b9\u6cd5&#xff0c;\u7528\u4e8e\u8bc4\u4f30\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u96c6\u5408&#xff08;\u8bed\u6599\u5e93&#xff09;\u4e2d\u67d0\u4e2a\u6587\u6863\u91cc\u7684\u91cd\u8981\u7a0b\u5ea6\u3002\u5176\u6838\u5fc3\u601d\u60f3\u662f&#xff1a;<\/p>\n<ol>\n<li>\u8bcd\u9891 (Term Frequency, TF)&#xff1a;\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6b21\u6570\u8d8a\u591a&#xff0c;\u5b83\u5bf9\u8fd9\u4e2a\u6587\u6863\u7684\u91cd\u8981\u6027\u53ef\u80fd\u5c31\u8d8a\u9ad8\u3002<\/li>\n<li>\u9006\u6587\u6863\u9891\u7387 (Inverse Document Frequency, IDF)&#xff1a;\u4e00\u4e2a\u8bcd\u5728\u6240\u6709\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6587\u6863\u6570\u8d8a\u5c11&#xff08;\u5373\u8d8a\u7a00\u6709&#xff09;&#xff0c;\u5b83\u533a\u5206\u6587\u6863\u7684\u80fd\u529b\u5c31\u8d8a\u5f3a&#xff0c;\u91cd\u8981\u6027\u5c31\u8d8a\u9ad8\u3002<\/li>\n<\/ol>\n<p>\u7b80\u5355\u6765\u8bf4&#xff0c;TF-IDF\u503c\u9ad8\u7684\u8bcd&#xff0c;\u662f\u90a3\u4e9b\u5728\u5f53\u524d\u6587\u6863\u4e2d\u9891\u7e41\u51fa\u73b0&#xff0c;\u4f46\u5728\u6574\u4e2a\u8bed\u6599\u5e93\u7684\u5176\u4ed6\u6587\u6863\u4e2d\u5f88\u5c11\u51fa\u73b0\u7684\u8bcd\u3002\u8fd9\u4e9b\u8bcd\u5f80\u5f80\u6700\u80fd\u4ee3\u8868\u8be5\u6587\u6863\u7684\u72ec\u7279\u4e3b\u9898\u3002<\/p>\n<h3>1.\u516c\u5f0f\u8be6\u89e3<\/h3>\n<ul>\n<li>TF 
(\u8bcd\u9891)&#xff1a;\n<ul>\n<li>\u6700\u7b80\u5355\u7684\u5f62\u5f0f&#xff1a;TF(t, d) &#061; \u8bcdt\u5728\u6587\u6863d\u4e2d\u51fa\u73b0\u7684\u6b21\u6570 \/ \u6587\u6863d\u7684\u603b\u8bcd\u6570<\/li>\n<li>\u8fd8\u6709\u5176\u4ed6\u53d8\u4f53&#xff0c;\u5982\u5bf9\u6570\u7f29\u653e&#xff1a;TF(t, d) &#061; 1 &#043; log(\u8bcdt\u5728\u6587\u6863d\u4e2d\u51fa\u73b0\u7684\u6b21\u6570)&#xff0c;\u4ee5\u907f\u514d\u957f\u6587\u6863\u4e2d\u8bcd\u9891\u8fc7\u9ad8\u3002<\/li>\n<\/ul>\n<\/li>\n<li>IDF (\u9006\u6587\u6863\u9891\u7387)&#xff1a;\n<ul>\n<li>IDF(t) &#061; log(\u8bed\u6599\u5e93\u4e2d\u6587\u6863\u603b\u6570 \/ \u5305\u542b\u8bcdt\u7684\u6587\u6863\u6570)<\/li>\n<li>\u8fd9\u91cc\u7684 log \u901a\u5e38\u53d6\u81ea\u7136\u5bf9\u6570&#xff08;\u4ee5 e \u4e3a\u5e95&#xff09;\u6216\u4ee510\u4e3a\u5e95&#xff0c;\u5177\u4f53\u5b9e\u73b0\u53ef\u80fd\u4e0d\u540c\u3002\u5b9e\u9645\u5b9e\u73b0\u4e2d\u5e38\u5728\u5206\u6bcd\u4e0a\u52a01&#xff08;\u6216\u5206\u5b50\u3001\u5206\u6bcd\u540c\u65f6\u52a01&#xff09;\u4f5c\u4e3a\u5e73\u6ed1\u5904\u7406&#xff0c;\u9632\u6b62\u5206\u6bcd\u4e3a0\u3002<\/li>\n<\/ul>\n<\/li>\n<li>TF-IDF&#xff1a;\n<ul>\n<li>TF-IDF(t, d) &#061; TF(t, d) * 
IDF(t)<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h3>2.\u5b9e\u4f8b\u8be6\u89e3<\/h3>\n<p>\u5047\u5b9a\u6709\u4e00\u7bc7\u6587\u7ae0\u300a\u4e2d\u56fd\u7684\u871c\u8702\u517b\u6b96\u300b&#xff0c;\u8be5\u6587\u957f\u5ea6\u4e3a1000\u4e2a\u8bcd&#xff0c;\u201c\u4e2d\u56fd\u201d\u3001\u201c\u871c\u8702\u201d\u3001\u201c\u517b\u6b96\u201d\u5404\u51fa\u73b020\u6b21&#xff0c;\u5219\u8fd9\u4e09\u4e2a\u8bcd\u7684\u201c\u8bcd\u9891\u201d&#xff08;TF&#xff09;\u90fd\u4e3a0.02\u3002\u7136\u540e&#xff0c;\u641c\u7d22Google\u53d1\u73b0&#xff0c;\u5305\u542b\u201c\u7684\u201d\u5b57\u7684\u7f51\u9875\u5171\u6709250\u4ebf\u5f20&#xff0c;\u5047\u5b9a\u8fd9\u5c31\u662f\u4e2d\u6587\u7f51\u9875\u603b\u6570\u3002\u5305\u542b\u201c\u4e2d\u56fd\u201d\u7684\u7f51\u9875\u5171\u670962.3\u4ebf\u5f20&#xff0c;\u5305\u542b\u201c\u871c\u8702\u201d\u7684\u7f51\u9875\u4e3a0.484\u4ebf\u5f20&#xff0c;\u5305\u542b\u201c\u517b\u6b96\u201d\u7684\u7f51\u9875\u4e3a0.973\u4ebf\u5f20\u3002\u5219\u5b83\u4eec\u7684\u9006\u6587\u6863\u9891\u7387&#xff08;IDF&#xff09;\u548cTF-IDF\u5982\u4e0b&#xff1a;<\/p>\n<table>\n<thead>\n<tr>\n<th>\u8bcd<\/th>\n<th>\u5305\u542b\u8be5\u8bcd\u7684\u6587\u6863\u6570&#xff08;\u4ebf&#xff09;<\/th>\n<th>IDF<\/th>\n<th>TF-IDF<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u4e2d\u56fd<\/td>\n<td>62.3<\/td>\n<td>0.603<\/td>\n<td>0.0121<\/td>\n<\/tr>\n<tr>\n<td>\u871c\u8702<\/td>\n<td>0.484<\/td>\n<td>2.713<\/td>\n<td>0.0543<\/td>\n<\/tr>\n<tr>\n<td>\u517b\u6b96<\/td>\n<td>0.973<\/td>\n<td>2.410<\/td>\n<td>0.0482<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u4ece\u4e0a\u8868\u53ef\u89c1&#xff0c;\u201c\u871c\u8702\u201d\u7684TF-IDF\u503c\u6700\u9ad8&#xff0c;\u201c\u517b\u6b96\u201d\u5176\u6b21&#xff0c;\u201c\u4e2d\u56fd\u201d\u6700\u4f4e\u3002\u6240\u4ee5&#xff0c;\u5982\u679c\u53ea\u9009\u62e9\u4e00\u4e2a\u8bcd&#xff0c;\u201c\u871c\u8702\u201d\u5c31\u662f\u8fd9\u7bc7\u6587\u7ae0\u7684\u5173\u952e\u8bcd\u3002<\/p>\n<p>\u7ed3\u679c\u89e3\u91ca&#xff1a;<\/p>\n<ul>\n<li>\u9ad8TF-IDF\u503c&#xff1a;\u8be5\u8bcd\u5728\u5f53\u524d\u6587\u6863\u4e2d\u5f88\u5e38\u89c1&#xff0c;\u4f46\u5728\u8bed\u6599\u5e93\u7684
\u5176\u4ed6\u6587\u6863\u4e2d\u4e0d\u5e38\u89c1\u3002\u8fd9\u901a\u5e38\u662f\u4e00\u4e2a\u5173\u952e\u8bcd\u3002<\/li>\n<li>\u4f4eTF-IDF\u503c&#xff1a;\u8be5\u8bcd\u8981\u4e48\u5728\u5f53\u524d\u6587\u6863\u4e2d\u4e0d\u5e38\u89c1&#xff08;\u4f4eTF&#xff09;&#xff0c;\u8981\u4e48\u5728\u5f88\u591a\u6587\u6863\u4e2d\u90fd\u5f88\u5e38\u89c1&#xff08;\u4f4eIDF&#xff0c;\u5982\u201c\u7684\u201d\u3001\u201c\u662f\u201d\u3001\u201c\u5728\u201d\u7b49\u505c\u7528\u8bcd&#xff09;\u3002\u8fd9\u7c7b\u8bcd\u901a\u5e38\u4e0d\u91cd\u8981\u3002<\/li>\n<\/ul>\n<hr \/>\n<h2>\u4e8c\u3001 \u4e3a\u4ec0\u4e48\u4f7f\u7528TF-IDF&#xff1f;<\/h2>\n<ul>\n<li>\u964d\u7ef4\u4e0e\u7279\u5f81\u63d0\u53d6&#xff1a;\u5c06\u6587\u672c\u8f6c\u6362\u4e3a\u6570\u503c\u5411\u91cf&#xff0c;\u4fbf\u4e8e\u673a\u5668\u5b66\u4e60\u6a21\u578b\u5904\u7406\u3002<\/li>\n<li>\u5173\u952e\u8bcd\u63d0\u53d6&#xff1a;\u627e\u51fa\u6700\u80fd\u4ee3\u8868\u6587\u6863\u4e3b\u9898\u7684\u8bcd\u6c47\u3002<\/li>\n<li>\u6587\u6863\u76f8\u4f3c\u5ea6\u8ba1\u7b97&#xff1a;\u901a\u8fc7\u6bd4\u8f83\u6587\u6863\u7684TF-IDF\u5411\u91cf&#xff08;\u5982\u8ba1\u7b97\u4f59\u5f26\u76f8\u4f3c\u5ea6&#xff09;\u6765\u5224\u65ad\u5b83\u4eec\u7684\u76f8\u4f3c\u6027\u3002<\/li>\n<li>\u4fe1\u606f\u68c0\u7d22&#xff1a;\u641c\u7d22\u5f15\u64ce\u7528\u6765\u8bc4\u4f30\u6587\u6863\u4e0e\u67e5\u8be2\u7684\u76f8\u5173\u6027\u3002<\/li>\n<li>\u6587\u672c\u5206\u7c7b&#xff1a;\u4f5c\u4e3a\u5206\u7c7b\u6a21\u578b\u7684\u8f93\u5165\u7279\u5f81\u3002<\/li>\n<\/ul>\n<hr \/>\n<h2>\u4e09\u3001 Python API \u5b9e\u8df5&#xff1a;scikit-learn<\/h2>\n<p>Python\u4e2d\u6700\u5e38\u7528\u4e14\u6700\u65b9\u4fbf\u5b9e\u73b0TF-IDF\u7684\u5e93\u662f scikit-learn\u3002\u5176\u6838\u5fc3\u7c7b\u662f TfidfVectorizer\u3002<\/p>\n<h3>1.TfidfVectorizer 
\u4e3b\u8981\u53c2\u6570<\/h3>\n<ul>\n<li>max_features&#xff1a;\u53ea\u4fdd\u7559\u5728\u6574\u4e2a\u8bed\u6599\u5e93\u4e2d\u8bcd\u9891\u6700\u9ad8\u7684\u524dN\u4e2a\u8bcd\u4f5c\u4e3a\u7279\u5f81\u3002<\/li>\n<li>stop_words&#xff1a;\u6307\u5b9a\u505c\u7528\u8bcd\u5217\u8868&#xff08;\u5982\u82f1\u6587\u7684&#039;english&#039;&#xff09;&#xff0c;\u8fd9\u4e9b\u8bcd\u4f1a\u88ab\u5ffd\u7565\u3002<\/li>\n<li>ngram_range&#xff1a;\u8003\u8651\u8bcd\u7ec4&#xff08;n-grams&#xff09;&#xff0c;\u4f8b\u5982 (1, 2) \u8868\u793a\u540c\u65f6\u8003\u8651\u5355\u4e2a\u8bcd\u548c\u4e8c\u5143\u8bcd\u7ec4\u3002<\/li>\n<li>min_df \/ max_df&#xff1a;\u8fc7\u6ee4\u6389\u5728\u5c11\u4e8emin_df\u4e2a\u6587\u6863\u6216\u8d85\u8fc7max_df\u6bd4\u4f8b\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u8bcd\u3002<\/li>\n<li>sublinear_tf&#xff1a;\u5982\u679c\u4e3aTrue&#xff0c;\u5219\u4f7f\u75281 &#043; log(TF)\u4f5c\u4e3a\u8bcd\u9891&#xff0c;\u6709\u52a9\u4e8e\u6291\u5236\u9ad8\u9891\u8bcd\u7684\u5f71\u54cd\u3002<\/li>\n<li>norm&#xff1a;\u5411\u91cf\u7684\u5f52\u4e00\u5316\u65b9\u5f0f&#xff0c;\u901a\u5e38\u662f&#039;l2&#039;\u3002<\/li>\n<\/ul>\n<h3>2.\u6838\u5fc3\u65b9\u6cd5<\/h3>\n<ul>\n<li>fit(documents)&#xff1a;\u5b66\u4e60\u8bcd\u6c47\u8868\u548cIDF\u503c\u3002<\/li>\n<li>transform(documents)&#xff1a;\u5c06\u6587\u6863\u8f6c\u6362\u4e3aTF-IDF\u77e9\u9635\u3002<\/li>\n<li>fit_transform(documents)&#xff1a;\u4e00\u6b65\u5b8c\u6210\u5b66\u4e60\u548c\u8f6c\u6362\u3002<\/li>\n<li>get_feature_names_out()&#xff1a;\u83b7\u53d6\u7279\u5f81&#xff08;\u8bcd&#xff09;\u7684\u540d\u79f0\u5217\u8868\u3002<\/li>\n<li>idf_&#xff08;\u5c5e\u6027&#xff09;&#xff1a;\u83b7\u53d6\u6bcf\u4e2a\u8bcd\u7684IDF\u503c&#xff08;\u9700\u8981\u5148\u8c03\u7528fit&#xff09;\u3002<\/li>\n<\/ul>\n<hr \/>\n<h2>\u56db\u3001 \u5b9e\u6218\u4f8b\u5b50&#xff1a;\u4ece\u6587\u4ef6\u8bfb\u53d6\u5e76\u5206\u6790\u5173\u952e\u8bcd<\/h2>\n<p>\u73b0\u5728&#xff0c;\u6211\u4eec\u6765\u770b\u4e00\u4e2a\u66f4\u8d34\u8fd1\u5b9e\u9645\u5e94\u7528\u573a\u666f\u7684\u4f8b\u5b50\u3002\u6211\u4eec\u5c06\u4ece\u4e00\u4e2a\u540d\u4e3a task2_1.txt 
\u7684\u6587\u672c\u6587\u4ef6\u4e2d\u8bfb\u53d6\u591a\u884c\u6587\u672c&#xff08;\u6bcf\u884c\u89c6\u4e3a\u4e00\u4e2a\u72ec\u7acb\u7684\u6587\u6863&#xff09;&#xff0c;\u7136\u540e\u4f7f\u7528TF-IDF\u5206\u6790\u5e76\u63d0\u53d6\u6700\u91cd\u8981\u7684\u5173\u952e\u8bcd\u3002<\/p>\n<h3>1.\u6587\u4ef6\u6570\u636e<\/h3>\n<p>This <span class=\"token keyword\">is<\/span> the first document<br \/>\nThis document <span class=\"token keyword\">is<\/span> the second document<br \/>\nAnd this <span class=\"token keyword\">is<\/span> the third one<br \/>\nIs this the first document<br \/>\nThis line has several words<br \/>\nThis <span class=\"token keyword\">is<\/span> the final document<\/p>\n<h3>2.\u5b8c\u6574\u4ee3\u7801<\/h3>\n<p>from sklearn.feature_extraction.text import TfidfVectorizer<br \/>\nimport pandas as pd<\/p>\n<p># 1. \u8bfb\u53d6\u6587\u672c\u6587\u4ef6\u5185\u5bb9<br \/>\n# \u5047\u8bbe task2_1.txt \u6bcf\u884c\u662f\u4e00\u4e2a\u6587\u6863<br \/>\ninFile &#061; open(r&#039;task2_1.txt&#039;, &#039;r&#039;, encoding&#061;&#039;utf-8&#039;)  # \u5efa\u8bae\u6307\u5b9a\u7f16\u7801<br \/>\ncorpus &#061; inFile.readlines()<br \/>\ninFile.close()  # \u5173\u95ed\u6587\u4ef6\u91ca\u653e\u8d44\u6e90<br \/>\n# \u6216\u8005\u4f7f\u7528\u66f4\u5b89\u5168\u7684 with \u8bed\u53e5&#xff1a;<br \/>\n# with open(r&#039;task2_1.txt&#039;, &#039;r&#039;, encoding&#061;&#039;utf-8&#039;) as f:<br \/>\n#     corpus &#061; f.readlines()<\/p>\n<p># 2. \u521d\u59cb\u5316TF-IDF\u5411\u91cf\u5668\u5e76\u8ba1\u7b97\u77e9\u9635<br \/>\n# \u4f7f\u7528\u9ed8\u8ba4\u53c2\u6570&#xff0c;\u53ef\u6839\u636e\u9700\u8981\u6dfb\u52a0 stop_words, max_features \u7b49<br \/>\nvectorizer &#061; TfidfVectorizer()<br \/>\ntfidf_matrix &#061; vectorizer.fit_transform(corpus)<\/p>\n<p># 3. \u83b7\u53d6\u8bcd\u6c47\u5217\u8868&#xff08;\u517c\u5bb9\u65b0\u7248\u672csklearn&#xff09;<br \/>\nwordlist &#061; vectorizer.get_feature_names_out()<\/p>\n<p># 4. 
\u8f6c\u6362\u4e3aDataFrame&#xff08;\u884c&#xff1a;\u8bcd\u6c47&#xff0c;\u5217&#xff1a;\u6587\u6863&#xff09;<br \/>\n# .T \u8868\u793a\u8f6c\u7f6e&#xff0c;\u4f7f\u5f97\u8bcd\u6c47\u6210\u4e3a\u884c\u7d22\u5f15&#xff0c;\u6587\u6863\u6210\u4e3a\u5217<br \/>\ndf &#061; pd.DataFrame(tfidf_matrix.T.todense(), index&#061;wordlist)<\/p>\n<p># 5. \u8ba1\u7b97\u6bcf\u4e2a\u8bcd\u6c47\u7684\u5e73\u5747TF-IDF\u503c&#xff08;\u4f5c\u4e3a\u91cd\u8981\u6027\u6307\u6807&#xff09;<br \/>\n# axis&#061;1 \u8868\u793a\u5bf9\u6bcf\u4e00\u884c&#xff08;\u5373\u6bcf\u4e2a\u8bcd\u5728\u6240\u6709\u6587\u6863\u4e2d\u7684TF-IDF\u503c&#xff09;\u6c42\u5e73\u5747<br \/>\ndf[&#039;mean_tfidf&#039;] &#061; df.mean(axis&#061;1)<\/p>\n<p># 6. \u6309\u5e73\u5747TF-IDF\u503c\u964d\u5e8f\u6392\u5e8f<br \/>\n# ascending&#061;False \u8868\u793a\u964d\u5e8f\u6392\u5217&#xff0c;\u9ad8\u5206\u5728\u524d<br \/>\nsorted_words &#061; df.sort_values(by&#061;&#039;mean_tfidf&#039;, ascending&#061;False)<\/p>\n<p># 7. \u6253\u5370\u5b8c\u6574\u6392\u5e8f\u7ed3\u679c&#xff08;\u6240\u6709\u8bcd\u6c47\u53ca\u5176\u5e73\u5747TF-IDF\u503c&#xff09;<br \/>\nprint(&#034;\u6240\u6709\u5173\u952e\u8bcd\u6309\u5e73\u5747TF-IDF\u503c\u6392\u5e8f&#xff08;\u964d\u5e8f&#xff09;&#xff1a;&#034;)<br \/>\nprint(sorted_words[[&#039;mean_tfidf&#039;]])  # \u53ea\u663e\u793a\u5e73\u5747TF-IDF\u503c\u5217<\/p>\n<p># 8. 
\u6253\u5370\u524d\u4e94\u540d\u5173\u952e\u8bcd<br \/>\nprint(&#034;\\n\u6392\u540d\u524d\u4e94\u7684\u5173\u952e\u8bcd&#xff1a;&#034;)<br \/>\ntop5_words &#061; sorted_words.head(5)<br \/>\nprint(top5_words.index.tolist())  # \u53ea\u663e\u793a\u8bcd\u6c47\u540d\u79f0<\/p>\n<h3>3.\u4ee3\u7801\u8be6\u89e3<\/h3>\n<li>\u6587\u4ef6\u8bfb\u53d6 (readlines)&#xff1a;readlines() \u65b9\u6cd5\u5c06\u6587\u4ef6\u7684\u6bcf\u4e00\u884c\u8bfb\u53d6\u4e3a\u5217\u8868 corpus \u4e2d\u7684\u4e00\u4e2a\u5143\u7d20\u3002\u6bcf\u884c\u88ab\u89c6\u4e3a\u8bed\u6599\u5e93\u4e2d\u7684\u4e00\u4e2a\u72ec\u7acb\u6587\u6863\u3002<\/li>\n<li>TF-IDF \u8ba1\u7b97&#xff1a;TfidfVectorizer().fit_transform(corpus) \u4f1a\u81ea\u52a8\u5904\u7406\u5206\u8bcd&#xff08;\u57fa\u4e8e\u7a7a\u683c\u548c\u6807\u70b9&#xff09;\u3001\u6784\u5efa\u8bcd\u6c47\u8868\u3001\u8ba1\u7b97\u6bcf\u4e2a\u8bcd\u5728\u6bcf\u4e2a\u6587\u6863\u4e2d\u7684TF-IDF\u503c&#xff0c;\u5e76\u8fd4\u56de\u4e00\u4e2a\u7a00\u758f\u77e9\u9635\u3002<\/li>\n<li>\u6570\u636e\u8f6c\u6362 (tfidf_matrix.T.todense())&#xff1a;\n<ul>\n<li>tfidf_matrix \u7684\u539f\u59cb\u5f62\u72b6\u662f (\u6587\u6863\u6570, \u8bcd\u6c47\u6570)\u3002<\/li>\n<li>.T \u5c06\u5176\u8f6c\u7f6e\u4e3a (\u8bcd\u6c47\u6570, \u6587\u6863\u6570)&#xff0c;\u8fd9\u6837\u6bcf\u4e2a\u8bcd\u6c47\u5c31\u5bf9\u5e94\u4e00\u884c\u3002<\/li>\n<li>.todense() \u5c06\u7a00\u758f\u77e9\u9635\u8f6c\u6362\u4e3a\u7a20\u5bc6\u7684NumPy\u77e9\u9635&#xff08;numpy.matrix&#xff09;&#xff0c;\u4fbf\u4e8ePandas\u5904\u7406&#xff08;\u5bf9\u4e8e\u975e\u5e38\u5927\u7684\u77e9\u9635&#xff0c;\u6b64\u6b65\u9aa4\u53ef\u80fd\u6d88\u8017\u8f83\u591a\u5185\u5b58&#xff0c;\u53ef\u8003\u8651\u76f4\u63a5\u4f7f\u7528\u7a00\u758f\u77e9\u9635\u8ba1\u7b97\u5747\u503c&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u521b\u5efaDataFrame&#xff1a;\u4ee5 wordlist 
\u4f5c\u4e3a\u884c\u7d22\u5f15&#xff08;\u8bcd\u6c47&#xff09;&#xff0c;\u6bcf\u4e00\u5217\u4ee3\u8868\u4e00\u4e2a\u6587\u6863\u7684TF-IDF\u503c\u3002<\/li>\n<li>\u8ba1\u7b97\u5e73\u5747TF-IDF&#xff1a;df.mean(axis&#061;1) \u8ba1\u7b97\u6bcf\u4e2a\u8bcd\u5728\u6240\u6709\u6587\u6863\u4e2d\u7684\u5e73\u5747TF-IDF\u503c\u3002\u8fd9\u4e2a\u5e73\u5747\u503c\u53ef\u4ee5\u770b\u4f5c\u662f\u8be5\u8bcd\u5728\u6574\u4e2a\u8bed\u6599\u5e93\u4e2d\u7684\u7efc\u5408\u91cd\u8981\u6027\u3002\u4e00\u4e2a\u8bcd\u5982\u679c\u5728\u591a\u4e2a\u6587\u6863\u4e2d\u90fd\u662f\u5173\u952e\u8bcd&#xff08;\u9ad8TF-IDF&#xff09;&#xff0c;\u5b83\u7684\u5e73\u5747\u503c\u5c31\u4f1a\u5f88\u9ad8\u3002<\/li>\n<li>\u6392\u5e8f\u4e0e\u8f93\u51fa&#xff1a;sort_values(by&#061;&#039;mean_tfidf&#039;, ascending&#061;False) \u6309\u5e73\u5747\u91cd\u8981\u6027\u964d\u5e8f\u6392\u5217\u3002\u6700\u540e\u8f93\u51fa\u5b8c\u6574\u5217\u8868\u548c\u524d\u4e94\u540d\u5173\u952e\u8bcd\u3002<\/li>\n<h3>4.\u8f93\u51fa\u793a\u4f8b<\/h3>\n<p>\u6240\u6709\u5173\u952e\u8bcd\u6309\u5e73\u5747TF-IDF\u503c\u6392\u5e8f&#xff08;\u964d\u5e8f&#xff09;&#xff1a;<br \/>\n          mean_tfidf<br \/>\ndocument    0.330080<br \/>\nis          0.281045<br \/>\nthe         0.281045<br \/>\nthis        0.279621<br \/>\nfirst       0.206838<br \/>\nfinal       0.115731<br \/>\nsecond      0.094208<br \/>\nand         0.086389<br \/>\none         0.086389<br \/>\nthird       0.086389<br \/>\nhas         0.081354<br \/>\nline        0.081354<br \/>\nseveral     0.081354<br \/>\nwords       0.081354<\/p>\n<p>\u6392\u540d\u524d\u4e94\u7684\u5173\u952e\u8bcd&#xff1a;<br \/>\n[&#039;document&#039;, &#039;is&#039;, &#039;the&#039;, &#039;this&#039;, &#039;first&#039;]<\/p>\n<p>\u8fd9\u610f\u5473\u7740 document \u662f\u6574\u4e2a\u6587\u4ef6\u96c6\u5408\u4e2d\u6700\u91cd\u8981\u7684\u5173\u952e\u8bcd\u3002<\/p>\n<h3>5.\u6539\u8fdb\u5efa\u8bae<\/h3>\n<ul>\n<li>\u6587\u4ef6\u7f16\u7801&#xff1a;\u8bfb\u53d6\u6587\u4ef6\u65f6\u6700\u597d\u6307\u5b9a 
encoding&#061;&#039;utf-8&#039;&#xff0c;\u907f\u514d\u7f16\u7801\u9519\u8bef\u3002<\/li>\n<li>\u8d44\u6e90\u7ba1\u7406&#xff1a;\u4f7f\u7528 with open(&#8230;) as f: \u8bed\u53e5\u53ef\u4ee5\u81ea\u52a8\u7ba1\u7406\u6587\u4ef6\u5173\u95ed\u3002<\/li>\n<li>\u505c\u7528\u8bcd&#xff1a;\u5bf9\u4e8e\u82f1\u6587\u6587\u672c&#xff0c;\u5f3a\u70c8\u5efa\u8bae\u6dfb\u52a0 stop_words&#061;&#039;english&#039; \u6765\u8fc7\u6ee4\u6389 the, is, and \u7b49\u65e0\u610f\u4e49\u7684\u8bcd\u3002<\/li>\n<li>\u5185\u5b58\u4f18\u5316&#xff1a;\u5bf9\u4e8e\u5927\u6587\u4ef6&#xff0c;\u907f\u514d\u4f7f\u7528 .todense()&#xff0c;\u53ef\u4ee5\u76f4\u63a5\u7528 tfidf_matrix.mean(axis&#061;0) \u8ba1\u7b97\u6bcf\u5217&#xff08;\u8bcd\u6c47&#xff09;\u7684\u5747\u503c&#xff0c;\u7136\u540e\u4e0e wordlist \u7ed3\u5408\u6392\u5e8f\u3002<\/li>\n<li>\u9884\u5904\u7406&#xff1a;\u53ef\u4ee5\u81ea\u5b9a\u4e49 token_pattern \u6216\u4f7f\u7528\u66f4\u590d\u6742\u7684\u5206\u8bcd\u5668&#xff08;\u5982jieba\u7528\u4e8e\u4e2d\u6587&#xff09;\u3002<\/li>\n<\/ul>\n<hr \/>\n<h2>\u4e94\u3001 \u603b\u7ed3\u4e0e\u5c55\u671b<\/h2>\n<p>TF-IDF\u662f\u4e00\u4e2a\u7b80\u5355\u4f46\u6781\u5176\u6709\u6548\u7684\u6587\u672c\u8868\u793a\u65b9\u6cd5&#xff0c;\u81f3\u4eca\u4ecd\u5728\u8bb8\u591a\u5e94\u7528\u4e2d\u53d1\u6325\u7740\u91cd\u8981\u4f5c\u7528\u3002\u901a\u8fc7 scikit-learn \u7684 
TfidfVectorizer&#xff0c;\u6211\u4eec\u53ef\u4ee5\u975e\u5e38\u65b9\u4fbf\u5730\u5c06\u5176\u5e94\u7528\u4e8e\u5b9e\u9645\u9879\u76ee&#xff0c;\u5982\u672c\u6587\u5c55\u793a\u7684\u4ece\u6587\u4ef6\u8bfb\u53d6\u5e76\u5206\u6790\u5173\u952e\u8bcd\u3002<\/p>\n<p>\u4f18\u70b9&#xff1a;<\/p>\n<ul>\n<li>\u6982\u5ff5\u6e05\u6670&#xff0c;\u6613\u4e8e\u7406\u89e3\u548c\u5b9e\u73b0\u3002<\/li>\n<li>\u6709\u6548\u6291\u5236\u5e38\u89c1\u8bcd&#xff08;\u505c\u7528\u8bcd&#xff09;\u7684\u5f71\u54cd&#xff0c;\u7a81\u51fa\u5173\u952e\u8bcd\u3002<\/li>\n<li>\u8ba1\u7b97\u76f8\u5bf9\u9ad8\u6548\u3002<\/li>\n<\/ul>\n<p>\u5c40\u9650\u6027&#xff1a;<\/p>\n<ul>\n<li>\u5ffd\u7565\u8bcd\u5e8f\u548c\u8bed\u4e49&#xff1a;\u5c06\u6587\u672c\u89c6\u4e3a\u201c\u8bcd\u888b\u201d&#xff08;Bag of 
Words&#xff09;&#xff0c;\u4e22\u5931\u4e86\u8bcd\u5e8f\u548c\u4e0a\u4e0b\u6587\u4fe1\u606f\u3002<\/li>\n<li>\u65e0\u6cd5\u5904\u7406\u540c\u4e49\u8bcd\/\u591a\u4e49\u8bcd&#xff1a;\u4e0d\u540c\u7684\u8bcd\u53ef\u80fd\u6709\u76f8\u540c\u542b\u4e49&#xff08;\u540c\u4e49\u8bcd&#xff09;&#xff0c;\u540c\u4e00\u4e2a\u8bcd\u5728\u4e0d\u540c\u4e0a\u4e0b\u6587\u53ef\u80fd\u6709\u4e0d\u540c\u542b\u4e49&#xff08;\u591a\u4e49\u8bcd&#xff09;\u3002<\/li>\n<li>\u5bf9\u77ed\u6587\u672c\u6548\u679c\u53ef\u80fd\u4e0d\u4f73\u3002<\/li>\n<\/ul>\n<p>\u968f\u7740\u6df1\u5ea6\u5b66\u4e60\u7684\u53d1\u5c55&#xff0c;\u50cf Word2Vec, GloVe, \u4ee5\u53ca\u57fa\u4e8eTransformer\u7684\u6a21\u578b&#xff08;\u5982 BERT&#xff09;\u80fd\u591f\u6355\u6349\u66f4\u4e30\u5bcc\u7684\u8bed\u4e49\u548c\u4e0a\u4e0b\u6587\u4fe1\u606f&#xff0c;\u6027\u80fd\u5f80\u5f80\u4f18\u4e8eTF-IDF\u3002\u7136\u800c&#xff0c;TF-IDF\u56e0\u5176\u7b80\u5355\u3001\u9ad8\u6548\u548c\u53ef\u89e3\u91ca\u6027\u5f3a&#xff0c;\u4ecd\u7136\u662f\u4e00\u4e2a\u4e0d\u53ef\u6216\u7f3a\u7684\u57fa\u7ebf\u5de5\u5177\u548c\u7279\u5f81\u5de5\u7a0b\u624b\u6bb5\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb187\u6b21\uff0c\u70b9\u8d5e13\u6b21\uff0c\u6536\u85cf4\u6b21\u3002TF-IDF\u662f\u4e00\u79cd\u7edf\u8ba1\u65b9\u6cd5\uff0c\u7528\u4e8e\u8bc4\u4f30\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u96c6\u5408\uff08\u8bed\u6599\u5e93\uff09\u4e2d\u67d0\u4e2a\u6587\u6863\u91cc\u7684\u91cd\u8981\u7a0b\u5ea6\u3002\u8bcd\u9891 (Term Frequency, TF)\uff1a\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6b21\u6570\u8d8a\u591a\uff0c\u5b83\u5bf9\u8fd9\u4e2a\u6587\u6863\u7684\u91cd\u8981\u6027\u53ef\u80fd\u5c31\u8d8a\u9ad8\u3002\u9006\u6587\u6863\u9891\u7387 (Inverse Document Frequency, 
IDF)\uff1a\u4e00\u4e2a\u8bcd\u5728\u6240\u6709\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6587\u6863\u6570\u8d8a\u5c11\uff08\u5373\u8d8a\u7a00\u6709\uff09\uff0c\u5b83\u533a\u5206\u6587\u6863\u7684\u80fd\u529b\u5c31\u8d8a\u5f3a\uff0c\u91cd\u8981\u6027\u5c31\u8d8a\u9ad8\u3002\u7b80\u5355\u6765\u8bf4\uff0cTF-IDF\u503c\u9ad8\u7684\u8bcd\uff0c\u662f\u90a3\u4e9b\u5728\u5f53\u524d\u6587\u6863\u4e2d\u9891\u7e41\u51fa\u73b0\uff0c\u4f46\u5728\u6574\u4e2a\u8bed\u6599\u5e93\u7684\u5176\u4ed6\u6587\u6863\u4e2d\u5f88\u5c11\u51fa\u73b0\u7684\u8bcd\u3002\u8fd9\u4e9b\u8bcd\u5f80\u5f80\u6700\u80fd\u4ee3\u8868\u8be5\u6587\u6863\u7684\u72ec\u7279\u4e3b\u9898\u3002<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[81,5333,50,207,427],"topic":[],"class_list":["post-52962","post","type-post","status-publish","format-standard","hentry","category-server","tag-python","tag-tf-idf","tag-50","tag-207","tag-427"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/52962.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" 
content=\"\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb187\u6b21\uff0c\u70b9\u8d5e13\u6b21\uff0c\u6536\u85cf4\u6b21\u3002TF-IDF\u662f\u4e00\u79cd\u7edf\u8ba1\u65b9\u6cd5\uff0c\u7528\u4e8e\u8bc4\u4f30\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u96c6\u5408\uff08\u8bed\u6599\u5e93\uff09\u4e2d\u67d0\u4e2a\u6587\u6863\u91cc\u7684\u91cd\u8981\u7a0b\u5ea6\u3002\u8bcd\u9891 (Term Frequency, TF)\uff1a\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6b21\u6570\u8d8a\u591a\uff0c\u5b83\u5bf9\u8fd9\u4e2a\u6587\u6863\u7684\u91cd\u8981\u6027\u53ef\u80fd\u5c31\u8d8a\u9ad8\u3002\u9006\u6587\u6863\u9891\u7387 (Inverse Document Frequency, IDF)\uff1a\u4e00\u4e2a\u8bcd\u5728\u6240\u6709\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6587\u6863\u6570\u8d8a\u5c11\uff08\u5373\u8d8a\u7a00\u6709\uff09\uff0c\u5b83\u533a\u5206\u6587\u6863\u7684\u80fd\u529b\u5c31\u8d8a\u5f3a\uff0c\u91cd\u8981\u6027\u5c31\u8d8a\u9ad8\u3002\u7b80\u5355\u6765\u8bf4\uff0cTF-IDF\u503c\u9ad8\u7684\u8bcd\uff0c\u662f\u90a3\u4e9b\u5728\u5f53\u524d\u6587\u6863\u4e2d\u9891\u7e41\u51fa\u73b0\uff0c\u4f46\u5728\u6574\u4e2a\u8bed\u6599\u5e93\u7684\u5176\u4ed6\u6587\u6863\u4e2d\u5f88\u5c11\u51fa\u73b0\u7684\u8bcd\u3002\u8fd9\u4e9b\u8bcd\u5f80\u5f80\u6700\u80fd\u4ee3\u8868\u8be5\u6587\u6863\u7684\u72ec\u7279\u4e3b\u9898\u3002\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/52962.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2025-08-11T14:09:03+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"4 \u5206\" \/>\n<script type=\"application\/ld+json\" 
class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/52962.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/52962.html\",\"name\":\"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2025-08-11T14:09:03+00:00\",\"dateModified\":\"2025-08-11T14:09:03+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/52962.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/52962.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/52962.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required 
name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/52962.html","og_locale":"zh_CN","og_type":"article","og_title":"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb187\u6b21\uff0c\u70b9\u8d5e13\u6b21\uff0c\u6536\u85cf4\u6b21\u3002TF-IDF\u662f\u4e00\u79cd\u7edf\u8ba1\u65b9\u6cd5\uff0c\u7528\u4e8e\u8bc4\u4f30\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u96c6\u5408\uff08\u8bed\u6599\u5e93\uff09\u4e2d\u67d0\u4e2a\u6587\u6863\u91cc\u7684\u91cd\u8981\u7a0b\u5ea6\u3002\u8bcd\u9891 (Term Frequency, TF)\uff1a\u4e00\u4e2a\u8bcd\u5728\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6b21\u6570\u8d8a\u591a\uff0c\u5b83\u5bf9\u8fd9\u4e2a\u6587\u6863\u7684\u91cd\u8981\u6027\u53ef\u80fd\u5c31\u8d8a\u9ad8\u3002\u9006\u6587\u6863\u9891\u7387 (Inverse Document Frequency, 
IDF)\uff1a\u4e00\u4e2a\u8bcd\u5728\u6240\u6709\u6587\u6863\u4e2d\u51fa\u73b0\u7684\u6587\u6863\u6570\u8d8a\u5c11\uff08\u5373\u8d8a\u7a00\u6709\uff09\uff0c\u5b83\u533a\u5206\u6587\u6863\u7684\u80fd\u529b\u5c31\u8d8a\u5f3a\uff0c\u91cd\u8981\u6027\u5c31\u8d8a\u9ad8\u3002\u7b80\u5355\u6765\u8bf4\uff0cTF-IDF\u503c\u9ad8\u7684\u8bcd\uff0c\u662f\u90a3\u4e9b\u5728\u5f53\u524d\u6587\u6863\u4e2d\u9891\u7e41\u51fa\u73b0\uff0c\u4f46\u5728\u6574\u4e2a\u8bed\u6599\u5e93\u7684\u5176\u4ed6\u6587\u6863\u4e2d\u5f88\u5c11\u51fa\u73b0\u7684\u8bcd\u3002\u8fd9\u4e9b\u8bcd\u5f80\u5f80\u6700\u80fd\u4ee3\u8868\u8be5\u6587\u6863\u7684\u72ec\u7279\u4e3b\u9898\u3002","og_url":"https:\/\/www.wsisp.com\/helps\/52962.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2025-08-11T14:09:03+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"4 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/52962.html","url":"https:\/\/www.wsisp.com\/helps\/52962.html","name":"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2025-08-11T14:09:03+00:00","dateModified":"2025-08-11T14:09:03+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/52962.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/52962.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/52962.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u673a\u5668\u5b66\u4e60TF-IDF\u7b97\u6cd5\u8be6\u89e3"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/52962","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=52962"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/52962\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=52962"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=52962"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=52962"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=52962"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}