Files
prompt-engineering-for-deve…/content/LangChain Chat with Your Data/5.检索 retrieval.ipynb
2023-07-16 13:37:02 +08:00

1 line
48 KiB
Plaintext

{"cells": [{"attachments": {}, "cell_type": "markdown", "id": "0689733d", "metadata": {}, "source": ["# \u7b2c\u4e94\u7ae0 \u68c0\u7d22(Retrieval)", "\n", " - [\u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22](#\u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22)\n", " - [1.1 \u76f8\u4f3c\u6027\u641c\u7d22](#1.1-\u76f8\u4f3c\u6027\u641c\u7d22)\n", " - [1.2 \u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)](#1.2-\u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR))\n", " - [1.3 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e](#1.3-\u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e)\n", " - [1.4 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668](#1.4-\u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668)\n", " - [1.5 \u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29](#1.5-\u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29)\n", " - [\u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f](#\u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f)\n", " - [\u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22](#\u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22)\n"]}, {"attachments": {}, "cell_type": "markdown", "id": "d12fbd74", "metadata": {}, "source": ["\n", "\u68c0\u7d22\u662f\u6211\u4eec\u7684\u68c0\u7d22\u589e\u5f3a\u751f\u6210(RAG)\u6d41\u7a0b\u7684\u6838\u5fc3\u3002\n", "\n", "\u8ba9\u6211\u4eec\u83b7\u5f97\u524d\u9762\u8bfe\u7a0b\u5b58\u50a8\u7684\u5411\u91cf\u6570\u636e\u5e93(`VectorDB`)\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "ed2569c6", "metadata": {}, "source": ["## \u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22"]}, {"attachments": {}, "cell_type": "markdown", "id": "651a10db", "metadata": {}, "source": ["\u5728\u5f53\u524d\u6587\u4ef6\u5939\u4e0b\u65b0\u5efa`.env`\u6587\u4ef6\uff0c\u5185\u5bb9\u4e3a`OPENAI_API_KEY = 
\"sk-...\"`\n", "\n", "\u672c\u7ae0\u8282\u9700\u8981\u4f7f\u7528`lark`\u5305\uff0c\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5"]}, {"cell_type": "code", "execution_count": 1, "id": "c18f8a7b-62af-403e-9877-18d1c2265b4f", "metadata": {"tags": []}, "outputs": [], "source": ["!pip install -Uq lark"]}, {"cell_type": "code", "execution_count": 3, "id": "51b15e5c-9b92-4d40-a149-b56335d330d9", "metadata": {"tags": []}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"attachments": {}, "cell_type": "markdown", "id": "c2d552e1", "metadata": {}, "source": ["### 1.1 \u76f8\u4f3c\u6027\u641c\u7d22"]}, {"cell_type": "code", "execution_count": 4, "id": "fe368042", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma/cs229_lectures/'\n", "persist_directory_chinese = 'docs/chroma/matplotlib/'"]}, {"attachments": {}, "cell_type": "markdown", "id": "5ba63e21", "metadata": {}, "source": ["\u5c06\u4e0a\u8282\u8bfe\u6240\u4fdd\u5b58\u7684\u5411\u91cf\u6570\u636e\u5e93(`VectorDB`)\u52a0\u8f7d\u8fdb\u6765"]}, {"cell_type": "code", "execution_count": 5, "id": "a0189dc5", "metadata": {"tags": []}, "outputs": [], "source": ["embedding = OpenAIEmbeddings()"]}, {"cell_type": "code", "execution_count": 6, "id": "2be10170", "metadata": {}, "outputs": [], "source": ["vectordb = Chroma(\n", " persist_directory=persist_directory,\n", " embedding_function=embedding\n", ")"]}, {"cell_type": "code", "execution_count": 7, "id": "3659e0f7", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["print(vectordb._collection.count())"]}, {"cell_type": "code", 
"execution_count": 8, "id": "a01ab000", "metadata": {}, "outputs": [], "source": ["vectordb_chinese = Chroma(\n", " persist_directory=persist_directory_chinese,\n", " embedding_function=embedding\n", ")"]}, {"cell_type": "code", "execution_count": 9, "id": "a6998a03", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["27\n"]}], "source": ["print(vectordb_chinese._collection.count())"]}, {"attachments": {}, "cell_type": "markdown", "id": "9ae4fdd8", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u73b0\u5728\u6765\u770b\u770b\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027\u7684\u4f8b\u5b50\u3002\u56e0\u6b64\uff0c\u6211\u4eec\u5c06\u4ece\u4e0b\u9762\u793a\u4f8b\u4e2d\u52a0\u8f7d\u6709\u5173\u8611\u83c7\u7684\u4fe1\u606f\u3002\n", "\n", "\u8ba9\u6211\u4eec\u73b0\u5728\u8fd0\u884c\u5b83\u4e0eMMR\u3002\u8ba9\u6211\u4eec\u4f20\u5165k\u7b49\u4e8e2\u3002\u6211\u4eec\u4ecd\u7136\u5e0c\u671b\u8fd4\u56de\u4e24\u4e2a\u6587\u6863\uff0c\u4f46\u8ba9\u6211\u4eec\u8bbe\u7f6e\u83b7\u53d6k\u7b49\u4e8e3\uff0c\u5176\u4e2d\u6211\u4eec\u6700\u521d\u83b7\u53d6\u6240\u6709\u4e09\u4e2a\u6587\u6863\u3002\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u770b\u5230\uff0c\u6211\u4eec\u68c0\u7d22\u7684\u6587\u6863\u4e2d\u8fd4\u56de\u4e86\u6709\u6bd2\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": 10, "id": "a807c758", "metadata": {"tags": []}, "outputs": [], "source": ["texts = [\n", " \"\"\"The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).\"\"\",\n", " \"\"\"A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.\"\"\",\n", " \"\"\"A. 
phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.\"\"\",\n", "]"]}, {"cell_type": "code", "execution_count": 11, "id": "b110cceb", "metadata": {}, "outputs": [], "source": ["texts_chinese = [\n", " \"\"\"\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u5177\u6709\u5927\u578b\u4e14\u5f15\u4eba\u6ce8\u76ee\u7684\u5730\u4e0a\uff08epigeous\uff09\u5b50\u5b9e\u4f53\uff08basidiocarp\uff09\"\"\",\n", " \"\"\"\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002\"\"\",\n", " \"\"\"A. phalloides\uff0c\u53c8\u540d\u6b7b\u4ea1\u5e3d\uff0c\u662f\u5df2\u77e5\u6240\u6709\u8611\u83c7\u4e2d\u6700\u6709\u6bd2\u7684\u4e00\u79cd\u3002\"\"\",\n", "]"]}, {"attachments": {}, "cell_type": "markdown", "id": "84cd5f1c", "metadata": {}, "source": ["\u5bf9\u4e8e\u8fd9\u4e2a\u4f8b\u5b50\uff0c\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2a\u5c0f\u6570\u636e\u5e93\uff0c\u6211\u4eec\u53ef\u4ee5\u4f5c\u4e3a\u4e00\u4e2a\u793a\u4f8b\u6765\u4f7f\u7528\u3002"]}, {"cell_type": "code", "execution_count": 12, "id": "715d54f3", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1.28it/s]\n"]}], "source": ["smalldb = Chroma.from_texts(texts, embedding=embedding)"]}, {"cell_type": "code", "execution_count": 13, "id": "305e1714", "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 2.30it/s]\n"]}], "source": ["smalldb_chinese = Chroma.from_texts(texts_chinese, embedding=embedding)"]}, {"attachments": {}, "cell_type": "markdown", "id": "239a8d95", "metadata": {}, "source": ["\u4e0b\u9762\u662f\u6211\u4eec\u5bf9\u4e8e\u8fd9\u4e2a\u793a\u4f8b\u6240\u63d0\u51fa\u7684\u95ee\u9898"]}, {"cell_type": "code", 
"execution_count": 14, "id": "9a37b5a5", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"Tell me about all-white mushrooms with large fruiting bodies\""]}, {"cell_type": "code", "execution_count": 15, "id": "92312e57", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u544a\u8bc9\u6211\u5173\u4e8e\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u5168\u767d\u8272\u8611\u83c7\u7684\u4fe1\u606f\""]}, {"attachments": {}, "cell_type": "markdown", "id": "d3224a6d", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u6211\u4eec\u53ef\u4ee5\u8fd0\u884c\u4e00\u4e2a\u76f8\u4f3c\u6027\u641c\u7d22\uff0c\u8bbe\u7f6ek=2\uff0c\u53ea\u8fd4\u56de\u4e24\u4e2a\u6700\u76f8\u5173\u7684\u6587\u6863\u3002\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u5230\uff0c\u6ca1\u6709\u63d0\u5230\u5b83\u662f\u6709\u6bd2\u7684\u4e8b\u5b9e\u3002"]}, {"cell_type": "code", "execution_count": 16, "id": "24e3b025", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. 
Some varieties are all-white.', metadata={}),\n", " Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]"]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb.similarity_search(question, k=2)"]}, {"cell_type": "code", "execution_count": 17, "id": "d4c5a47d", "metadata": {}, "outputs": [{"data": {"text/plain": ["[Document(page_content='\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002', metadata={}),\n", " Document(page_content='\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u5177\u6709\u5927\u578b\u4e14\u5f15\u4eba\u6ce8\u76ee\u7684\u5730\u4e0a\uff08epigeous\uff09\u5b50\u5b9e\u4f53\uff08basidiocarp\uff09', metadata={})]"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb_chinese.similarity_search(question_chinese, k=2)"]}, {"attachments": {}, "cell_type": "markdown", "id": "bbb0ea94", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u8ba9\u6211\u4eec\u8fd0\u884c\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)\u3002\n", "\n", "\u8bbe\u7f6ek=2\uff0c\u56e0\u4e3a\u6211\u4eec\u4ecd\u7136\u5e0c\u671b\u8fd4\u56de\u4e24\u4e2a\u6587\u6863\u3002\u8bbe\u7f6efetch_k=3\uff0cfetch_k\u662f\u6211\u4eec\u6700\u521d\u83b7\u53d6\u7684\u6240\u6709\u6587\u6863(3\u4e2a)\u3002"]}, {"cell_type": "code", "execution_count": 18, "id": "4daa6c0d", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),\n", " Document(page_content='A. 
phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)"]}, {"cell_type": "code", "execution_count": 19, "id": "e15521d2", "metadata": {}, "outputs": [{"data": {"text/plain": ["[Document(page_content='\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002', metadata={}),\n", " Document(page_content='A. phalloides\uff0c\u53c8\u540d\u6b7b\u4ea1\u5e3d\uff0c\u662f\u5df2\u77e5\u6240\u6709\u8611\u83c7\u4e2d\u6700\u6709\u6bd2\u7684\u4e00\u79cd\u3002', metadata={})]"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb_chinese.max_marginal_relevance_search(question_chinese,k=2, fetch_k=3)"]}, {"attachments": {}, "cell_type": "markdown", "id": "e87c5f91", "metadata": {}, "source": ["\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u770b\u5230\uff0c\u6211\u4eec\u68c0\u7d22\u7684\u6587\u6863\u4e2d\u8fd4\u56de\u4e86\u6709\u6bd2\u7684\u4fe1\u606f\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "5a29e8c9", "metadata": {}, "source": ["### 1.2 \u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)"]}, {"attachments": {}, "cell_type": "markdown", "id": "a2b5c4ae", "metadata": {}, "source": ["\n", "\u6211\u4eec\u521a\u521a\u901a\u8fc7\u4e00\u4e2a\u793a\u4f8b\u5f15\u51fa\u4e86\u4e00\u4e2a\u95ee\u9898\uff1a\u5982\u4f55\u52a0\u5f3a\u641c\u7d22\u7ed3\u679c\u7684\u591a\u6837\u6027\u3002\n", " \n", "\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(`Maximum marginal relevance`)\u8bd5\u56fe\u5728\u67e5\u8be2\u7684\u76f8\u5173\u6027\u548c\u7ed3\u679c\u7684\u591a\u6837\u6027\u4e4b\u95f4\u5b9e\u73b0\u4e24\u5168\u5176\u7f8e\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "2360545c", "metadata": {}, "source": 
["\u8ba9\u6211\u4eec\u56de\u5230\u4e0a\u4e00\u8282\u8bfe\u7684\u4e00\u4e2a\u4f8b\u5b50\uff0c\u5f53\u6211\u4eec\u901a\u8fc7\u95ee\u9898\u5bf9\u5411\u91cf\u6570\u636e\u5e93\u8fdb\u884c\u76f8\u4f3c\u6027\u641c\u7d22\u540e\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u770b\u524d\u4e24\u4e2a\u6587\u6863\uff0c\u53ea\u770b\u524d\u51e0\u4e2a\u5b57\u7b26\uff0c\u53ef\u4ee5\u770b\u5230\u5b83\u4eec\u662f\u76f8\u540c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 20, "id": "9bb2c0a9", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about matlab?\"\n", "docs_ss = vectordb.similarity_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 21, "id": "f07f8793", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 22, "id": "e9f7e165", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss[1].page_content[:100]"]}, {"cell_type": "code", "execution_count": 23, "id": "e8e142eb", "metadata": {}, "outputs": [], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "docs_ss_chinese = vectordb_chinese.similarity_search(question_chinese,k=3)"]}, {"cell_type": "code", "execution_count": 24, "id": "cf642f66", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D 
\u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss_chinese[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 25, "id": "1e9f5cfe", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss_chinese[1].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "4c4ca1b6", "metadata": {}, "source": ["\u6ce8\u610f\uff1a\u4f7f\u7528`MMR`\u6240\u5f97\u51fa\u7ed3\u679c\u7684\u5dee\u5f02\u3002"]}, {"cell_type": "code", "execution_count": 26, "id": "222234c5", "metadata": {"tags": []}, "outputs": [], "source": ["docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 27, "id": "408935bc", "metadata": {}, "outputs": [], "source": ["docs_mmr_chinese = vectordb_chinese.max_marginal_relevance_search(question_chinese,k=3)"]}, {"attachments": {}, "cell_type": "markdown", "id": "9076db81", "metadata": {}, "source": ["\u5f53\u6211\u4eec\u8fd0\u884cMMR\u540e\u5f97\u5230\u7ed3\u679c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u7b2c\u4e00\u4e2a\u4e0e\u4e4b\u524d\u7684\u76f8\u540c\uff0c\u56e0\u4e3a\u90a3\u662f\u6700\u76f8\u4f3c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 28, "id": "93b20226", "metadata": {"tags": []}, "outputs": 
[{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 28, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 29, "id": "d0acfaab", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr_chinese[0].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "7a93743f", "metadata": {}, "source": ["\u4f46\u662f\u5f53\u6211\u4eec\u8fdb\u884c\u5230\u7b2c\u4e8c\u4e2a\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u662f\u4e0d\u540c\u7684\u3002\n", "\n", "\u5b83\u5728\u56de\u5e94\u4e2d\u83b7\u5f97\u4e86\u4e00\u4e9b\u591a\u6837\u6027\u3002"]}, {"cell_type": "code", "execution_count": 30, "id": "17d39762", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'algorithm then? So what\u2019s different? 
How come I was making all that noise earlier about \\nleast squa'"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr[1].page_content[:100]"]}, {"cell_type": "code", "execution_count": 31, "id": "93d3206c", "metadata": {}, "outputs": [{"data": {"text/plain": ["'By Datawhale \u6570\u636e\u53ef\u89c6\u5316\u5f00\u6e90\u2f29\u7ec4\\n\u00a9 Copyright \u00a9 Copyright 2021.y\u8f74\u5206\u4e3a\u5de6\u53f3\u4e24\u4e2a\uff0c\u56e0\u6b64 tick1 \u5bf9\u5e94\u5de6\u4fa7\u7684\u8f74\uff1b tick2 \u5bf9\u5e94\u53f3\u4fa7\u7684\u8f74\u3002\\nx\u8f74\u5206\u4e3a\u4e0a\u4e0b\u4e24\u4e2a'"]}, "execution_count": 31, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr_chinese[1].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "b2b909bc", "metadata": {}, "source": ["### 1.3 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e"]}, {"attachments": {}, "cell_type": "markdown", "id": "7b63c5ee", "metadata": {}, "source": ["\n", "\u5728\u4e0a\u4e00\u8282\u8bfe\u4e2d\uff0c\u6211\u4eec\u5c55\u793a\u4e86\u4e00\u4e2a\u95ee\u9898\uff0c\u662f\u8be2\u95ee\u4e86\u5173\u4e8e\u6587\u6863\u4e2d\u67d0\u4e00\u8bb2\u7684\u95ee\u9898\uff0c\u4f46\u5f97\u5230\u7684\u7ed3\u679c\u4e2d\u4e5f\u5305\u62ec\u4e86\u6765\u81ea\u5176\u4ed6\u8bb2\u7684\u7ed3\u679c\u3002\n", "\n", "\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e00\u95ee\u9898\uff0c\u5f88\u591a\u5411\u91cf\u6570\u636e\u5e93\u90fd\u652f\u6301\u5bf9`metadata`\u7684\u64cd\u4f5c\u3002\n", "\n", "`metadata`\u4e3a\u6bcf\u4e2a\u5d4c\u5165\u7684\u5757(embedded chunk)\u63d0\u4f9b\u4e0a\u4e0b\u6587\u3002"]}, {"cell_type": "code", "execution_count": 32, "id": "3c1a60b2", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\""]}, {"cell_type": "code", "execution_count": 33, "id": "ba98df3c", "metadata": {}, "outputs": [], "source": ["question_chinese = 
\"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"attachments": {}, "cell_type": "markdown", "id": "3873525e", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u6211\u4eec\u4ee5\u624b\u52a8\u7684\u65b9\u5f0f\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u4f1a\u6307\u5b9a\u4e00\u4e2a\u5143\u6570\u636e\u8fc7\u6ee4\u5668`filter`"]}, {"cell_type": "code", "execution_count": 34, "id": "a8612840", "metadata": {"tags": []}, "outputs": [], "source": ["docs = vectordb.similarity_search(\n", " question,\n", " k=3,\n", " filter={\"source\":\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\"}\n", ")"]}, {"cell_type": "code", "execution_count": 35, "id": "b46c7e76", "metadata": {}, "outputs": [], "source": ["docs_chinese = vectordb_chinese.similarity_search(\n", " question_chinese,\n", " k=3,\n", " filter={\"source\":\"docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf\"}\n", ")"]}, {"attachments": {}, "cell_type": "markdown", "id": "869aee28", "metadata": {}, "source": ["\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u7ed3\u679c\u90fd\u6765\u81ea\u5bf9\u5e94\u7684\u7ae0\u8282"]}, {"cell_type": "code", "execution_count": 36, "id": "97031876", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 4}\n"]}], "source": ["for d in docs:\n", " print(d.metadata)"]}, {"cell_type": "code", "execution_count": 35, "id": "2708f6ae", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 9}\n", "{'source': 
'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 10}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 0}\n"]}], "source": ["for d in docs_chinese:\n", " print(d.metadata)\n", " "]}, {"attachments": {}, "cell_type": "markdown", "id": "5e299f8e", "metadata": {}, "source": ["\u5f53\u7136\uff0c\u6211\u4eec\u4e0d\u80fd\u6bcf\u6b21\u90fd\u91c7\u7528\u624b\u52a8\u7684\u65b9\u5f0f\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u8fd9\u4f1a\u663e\u5f97\u4e0d\u591f\u667a\u80fd\n", "\n", "\u4e0b\u4e00\u5c0f\u8282\u5c06\u8981\u5c55\u793a\u901a\u8fc7LLM\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898"]}, {"attachments": {}, "cell_type": "markdown", "id": "ccc2d784", "metadata": {}, "source": ["### 1.4 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668"]}, {"attachments": {}, "cell_type": "markdown", "id": "82ef44b6", "metadata": {}, "source": ["\u6211\u4eec\u6709\u4e00\u4e2a\u6709\u8da3\u7684\u6311\u6218\uff1a\u6211\u4eec\u901a\u5e38\u5e0c\u671b\u4ece\u67e5\u8be2\u672c\u8eab\u6765\u63a8\u65ad\u5143\u6570\u636e\u3002\n", "\n", "\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528SelfQueryRetriever\uff0c\u5b83\u4f7f\u7528LLM\u6765\u63d0\u53d6\uff1a\n", " \n", "1. \u7528\u4e8e\u5411\u91cf\u641c\u7d22\u7684\u67e5\u8be2(`query`)\u5b57\u7b26\u4e32\uff0c\u5373\uff1a\u95ee\u9898\n", "2. 
\u8981\u4e00\u8d77\u4f20\u5165\u7684\u5143\u6570\u636e\u8fc7\u6ee4\u5668\n", "\n", "\u5927\u591a\u6570\u5411\u91cf\u6570\u636e\u5e93\u652f\u6301\u5143\u6570\u636e\u8fc7\u6ee4\u5668\uff0c\u56e0\u6b64\u4e0d\u9700\u8981\u4efb\u4f55\u65b0\u7684\u6570\u636e\u5e93\u53ca\u7d22\u5f15\u3002"]}, {"cell_type": "code", "execution_count": 38, "id": "b1d06084", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.llms import OpenAI\n", "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", "from langchain.chains.query_constructor.base import AttributeInfo"]}, {"cell_type": "code", "execution_count": 39, "id": "869c27c0", "metadata": {}, "outputs": [], "source": ["llm = OpenAI(temperature=0)"]}, {"attachments": {}, "cell_type": "markdown", "id": "acd194c5", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u5728`AttributeInfo`\u4e2d\u6307\u5b9a\u5143\u6570\u636e\u4e2d\u7684\u4e0d\u540c\u5b57\u6bb5\u4ee5\u53ca\u5b83\u4eec\u5bf9\u5e94\u7684\u542b\u4e49\u3002\n", "\n", "\u5728\u5143\u6570\u636e\u4e2d\uff0c\u6211\u4eec\u53ea\u6709\u4e24\u4e2a\u5b57\u6bb5\uff0c`source`\u548c`page`\u3002\n", "\n", "\u6211\u4eec\u5c06\u586b\u5199\u6bcf\u4e2a\u5c5e\u6027\u7684\u540d\u79f0\u3001\u63cf\u8ff0\u548c\u7c7b\u578b\u3002\n", "\n", "\u8fd9\u4e9b\u4fe1\u606f\u5b9e\u9645\u4e0a\u5c06\u88ab\u4f20\u9012\u7ed9LLM\uff0c\u6240\u4ee5\u9700\u8981\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u63cf\u8ff0\u3002"]}, {"cell_type": "code", "execution_count": 40, "id": "0aa5e698", "metadata": {"tags": []}, "outputs": [], "source": ["metadata_field_info = [\n", " AttributeInfo(\n", " name=\"source\",\n", " description=\"The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", " name=\"page\",\n", " description=\"The page from the lecture\",\n", " type=\"integer\",\n", 
" ),\n", "]"]}, {"cell_type": "code", "execution_count": 41, "id": "544ad7c1", "metadata": {}, "outputs": [], "source": ["metadata_field_info_chinese = [\n", " AttributeInfo(\n", " name=\"source\",\n", " description=\"\u8bb2\u4e49\u6765\u6e90\u4e8e `docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf`, `docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf`, or `docs/matplotlib/\u7b2c\u4e09\u56de\uff1a\u5e03\u5c40\u683c\u5f0f\u5b9a\u65b9\u5706.pdf` \u7684\u5176\u4e2d\u4e4b\u4e00\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", " name=\"page\",\n", " description=\"\u8bb2\u4e49\u7684\u90a3\u4e00\u9875\",\n", " type=\"integer\",\n", " ),\n", "]"]}, {"cell_type": "code", "execution_count": 42, "id": "e7906c15", "metadata": {"tags": []}, "outputs": [], "source": ["document_content_description = \"Lecture notes\"\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm,\n", " vectordb,\n", " document_content_description,\n", " metadata_field_info,\n", " verbose=True\n", ")"]}, {"cell_type": "code", "execution_count": 43, "id": "d5b99571", "metadata": {}, "outputs": [], "source": ["document_content_description_chinese = \"\u8bfe\u5802\u8bb2\u4e49\"\n", "retriever_chinese = SelfQueryRetriever.from_llm(\n", " llm,\n", " vectordb_chinese,\n", " document_content_description_chinese,\n", " metadata_field_info_chinese,\n", " verbose=True\n", ")"]}, {"cell_type": "code", "execution_count": 44, "id": "79d781b9", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\""]}, {"cell_type": "code", "execution_count": 45, "id": "8d9b7e18", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"attachments": {}, "cell_type": "markdown", "id": "c51778b0-1fcd-40a4-bd6b-0f13fec8acb1", "metadata": {}, "source": 
["\u5f53\u4f60\u7b2c\u4e00\u6b21\u6267\u884c\u4e0b\u4e00\u884c\u65f6\uff0c\u4f60\u4f1a\u6536\u5230\u5173\u4e8epredict_and_parse\u5df2\u88ab\u5f03\u7528\u7684**\u8b66\u544a**\u3002 \u8fd9\u53ef\u4ee5\u5b89\u5168\u5730\u5ffd\u7565\u3002"]}, {"cell_type": "code", "execution_count": 46, "id": "1d4f9f7d", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["/root/autodl-tmp/env/gpt/lib/python3.10/site-packages/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n"]}, {"name": "stdout", "output_type": "stream", "text": ["query='regression' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='source', value='docs/cs229_lectures/MachineLearning-Lecture03.pdf') limit=None\n"]}], "source": ["docs = retriever.get_relevant_documents(question)"]}, {"cell_type": "code", "execution_count": 47, "id": "ea39a97e", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["query='Figure' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='source', value='docs/matplotlib/\u7b2c\u4e8c\u8bb2\uff1a\u827a\u672f\u753b\u89e3\u7834.pdf') limit=None\n"]}], "source": ["docs_chinese = retriever_chinese.get_relevant_documents(question_chinese)"]}, {"cell_type": "code", "execution_count": 48, "id": "db04374e", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n"]}], "source": ["for d in docs:\n", " print(d.metadata)"]}, {"cell_type": "code", "execution_count": 49, "id": "143061f5", "metadata": {}, "outputs": [], "source": ["for d in docs_chinese:\n", " 
print(d.metadata)"]}, {"attachments": {}, "cell_type": "markdown", "id": "297b8168", "metadata": {}, "source": ["### 1.5 \u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29"]}, {"attachments": {}, "cell_type": "markdown", "id": "564144da", "metadata": {}, "source": ["\u53e6\u4e00\u79cd\u63d0\u9ad8\u68c0\u7d22\u5230\u7684\u6587\u6863\u8d28\u91cf\u7684\u65b9\u6cd5\u662f\u538b\u7f29\u3002\n", "\n", "\u4e0e\u67e5\u8be2\u6700\u76f8\u5173\u7684\u4fe1\u606f\u53ef\u80fd\u9690\u85cf\u5728\u5177\u6709\u5927\u91cf\u4e0d\u76f8\u5173\u6587\u672c\u7684\u6587\u6863\u4e2d\u3002\n", "\n", "\u5728\u5e94\u7528\u7a0b\u5e8f\u4e2d\u4f20\u9012\u5b8c\u6574\u7684\u6587\u6863\u53ef\u80fd\u4f1a\u5bfc\u81f4\u66f4\u6602\u8d35\u7684LLM\u8c03\u7528\u548c\u66f4\u5dee\u7684\u54cd\u5e94\u3002\n", "\n", "\u4e0a\u4e0b\u6587\u538b\u7f29\u5c31\u662f\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 50, "id": "a060cf74", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.retrievers import ContextualCompressionRetriever\n", "from langchain.retrievers.document_compressors import LLMChainExtractor"]}, {"cell_type": "code", "execution_count": 51, "id": "038649c8", "metadata": {"tags": []}, "outputs": [], "source": ["def pretty_print_docs(docs):\n", " print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"]}, {"cell_type": "code", "execution_count": 52, "id": "fc686cf2", "metadata": {"tags": []}, "outputs": [], "source": ["llm = OpenAI(temperature=0)\n", "compressor = LLMChainExtractor.from_llm(llm) # \u538b\u7f29\u5668"]}, {"cell_type": "code", "execution_count": 53, "id": "82794397", "metadata": {"tags": []}, "outputs": [], "source": ["compression_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 54, "id": "915598f8", "metadata": {}, "outputs": [], "source": 
["compression_retriever_chinese = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb_chinese.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 55, "id": "cde86848", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["/root/autodl-tmp/env/gpt/lib/python3.10/site-packages/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n"]}, {"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 3:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 4:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. 
It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n"]}], "source": ["question = \"what did they say about matlab?\"\n", "compressed_docs = compression_retriever.get_relevant_documents(question)\n", "pretty_print_docs(compressed_docs)"]}, {"cell_type": "code", "execution_count": 56, "id": "39726b24", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n"]}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", "pretty_print_docs(compressed_docs_chinese)"]}, {"attachments": {}, "cell_type": "markdown", "id": "049b2601", "metadata": {}, "source": ["\u73b0\u5728\u5f53\u6211\u4eec\u63d0\u51fa\u95ee\u9898\u540e\uff0c\u67e5\u770b\u7ed3\u679c\u6587\u6863\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u4e24\u4ef6\u4e8b\u3002\n", "\n", "1. \u5b83\u4eec\u6bd4\u6b63\u5e38\u6587\u6863\u77ed\u5f88\u591a\n", "2. 
\u4ecd\u7136\u6709\u4e00\u4e9b\u91cd\u590d\u7684\u4e1c\u897f\uff0c\u8fd9\u662f\u56e0\u4e3a\u5728\u5e95\u5c42\u6211\u4eec\u4f7f\u7528\u7684\u662f\u8bed\u4e49\u641c\u7d22\u7b97\u6cd5\u3002\n", "\n", "\u8fd9\u5c31\u662f\u6211\u4eec\u5728\u672c\u8bfe\u7a0b\u524d\u9762\u4f7f\u7528MMR\u89e3\u51b3\u7684\u95ee\u9898\u3002\n", "\n", "\u8fd9\u662f\u4e00\u4e2a\u5f88\u597d\u7684\u4f8b\u5b50\uff0c\u4f60\u53ef\u4ee5\u7ed3\u5408\u5404\u79cd\u6280\u672f\u5f97\u5230\u6700\u597d\u7684\u53ef\u80fd\u7ed3\u679c\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "82c4fc4d", "metadata": {}, "source": ["## \u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f"]}, {"attachments": {}, "cell_type": "markdown", "id": "54432975", "metadata": {}, "source": ["\u4e3a\u4e86\u505a\u5230\u8fd9\u4e00\u70b9\uff0c\u6211\u4eec\u5728\u4ece\u5411\u91cf\u6570\u636e\u5e93\u521b\u5efa\u68c0\u7d22\u5668\u65f6\uff0c\u53ef\u4ee5\u5c06\u641c\u7d22\u7c7b\u578b\u8bbe\u7f6e\u4e3aMMR\u3002\n", "\n", "\u7136\u540e\u6211\u4eec\u53ef\u4ee5\u91cd\u65b0\u8fd0\u884c\u8fd9\u4e2a\u8fc7\u7a0b\uff0c\u770b\u5230\u6211\u4eec\u8fd4\u56de\u7684\u662f\u4e00\u4e2a\u8fc7\u6ee4\u8fc7\u7684\u7ed3\u679c\u96c6\uff0c\u5176\u4e2d\u4e0d\u5305\u542b\u4efb\u4f55\u91cd\u590d\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": 57, "id": "161ae1ad", "metadata": {"tags": []}, "outputs": [], "source": ["compression_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb.as_retriever(search_type = \"mmr\")\n", ")"]}, {"cell_type": "code", "execution_count": 58, "id": "cd6396bb", "metadata": {}, "outputs": [], "source": ["compression_retriever_chinese = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb_chinese.as_retriever(search_type = \"mmr\")\n", ")"]}, {"cell_type": "code", "execution_count": 59, "id": "e77ccae1", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", 
"\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n"]}], "source": ["question = \"what did they say about matlab?\"\n", "compressed_docs = compression_retriever.get_relevant_documents(question)\n", "pretty_print_docs(compressed_docs)"]}, {"cell_type": "code", "execution_count": 60, "id": "fe68a14b", "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {\n", " \"error\": {\n", " \"message\": \"Request failed due to server shutdown\",\n", " \"type\": \"server_error\",\n", " \"param\": null,\n", " \"code\": null\n", " }\n", "}\n", " 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Sun, 16 Jul 2023 05:28:06 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'text-davinci-003', 'openai-organization': 'user-xnghkpntwvm31crtmex7n2j0', 'openai-processing-ms': '1159', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '3000', 'x-ratelimit-limit-tokens': '250000', 'x-ratelimit-remaining-requests': '2999', 
'x-ratelimit-remaining-tokens': '249744', 'x-ratelimit-reset-requests': '20ms', 'x-ratelimit-reset-tokens': '61ms', 'x-request-id': '22b2986140ddb5bcc689423e056b7daf', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '7e77d6f7590b1ec8-NRT', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n"]}, {"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n"]}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", "pretty_print_docs(compressed_docs_chinese)"]}, {"attachments": {}, "cell_type": "markdown", "id": "6c2b63e1", "metadata": {}, "source": ["## \u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22"]}, {"attachments": {}, "cell_type": "markdown", "id": "3e777a7b", "metadata": {}, "source": ["\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0cvectordb\u5e76\u4e0d\u662f\u552f\u4e00\u4e00\u79cd\u68c0\u7d22\u6587\u6863\u7684\u5de5\u5177\u3002\n", "\n", "`LangChain`\u68c0\u7d22\u5668\u62bd\u8c61\u5305\u62ec\u5176\u4ed6\u68c0\u7d22\u6587\u6863\u7684\u65b9\u5f0f\uff0c\u5982\uff1a`TF-IDF` \u6216 `SVM`\u3002"]}, {"cell_type": "code", "execution_count": 61, "id": "83d2e808", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.retrievers import SVMRetriever\n", "from langchain.retrievers import TFIDFRetriever\n", "from langchain.document_loaders import PyPDFLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter"]}, {"cell_type": "code", "execution_count": 62, "id": "bcf5b760", "metadata": {"tags": []}, "outputs": [], "source": ["# 
\u52a0\u8f7dPDF\n", "loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n", "pages = loader.load()\n", "all_page_text = [p.page_content for p in pages]\n", "joined_page_text = \" \".join(all_page_text)\n", "\n", "# \u5206\u5272\u6587\u672c\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", "splits = text_splitter.split_text(joined_page_text)\n"]}, {"cell_type": "code", "execution_count": 77, "id": "1657e768", "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7dPDF\n", "loader_chinese = PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf\")\n", "pages_chinese = loader_chinese.load()\n", "all_page_text_chinese = [p.page_content for p in pages_chinese]\n", "joined_page_text_chinese = \" \".join(all_page_text_chinese)\n", "\n", "# \u5206\u5272\u6587\u672c\n", "text_splitter_chinese = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", "splits_chinese = text_splitter_chinese.split_text(joined_page_text_chinese)"]}, {"cell_type": "code", "execution_count": 64, "id": "59abbaff", "metadata": {}, "outputs": [], "source": ["# \u68c0\u7d22\n", "svm_retriever = SVMRetriever.from_texts(splits, embedding)\n", "tfidf_retriever = TFIDFRetriever.from_texts(splits)"]}, {"cell_type": "code", "execution_count": 66, "id": "7885389e", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"let me just check what questions you have righ t now. So if there are no questions, I'll just \\nclose with two reminders, which are after class today or as you start to talk with other \\npeople in this class, I just encourage you again to start to form project partners, to try to \\nfind project partners to do your project with. And also, this is a good time to start forming \\nstudy groups, so either talk to your friends or post in the newsgroup, but we just \\nencourage you to try to star t to do both of those today, okay? 
Form study groups, and try \\nto find two other project partners. \\nSo thank you. I'm looking forward to teaching this class, and I'll see you in a couple of \\ndays. [End of Audio] \\nDuration: 69 minutes\", metadata={})"]}, "execution_count": 66, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\" # \u8fd9\u95e8\u8bfe\u7684\u4e3b\u8981\u4e3b\u9898\u662f\u4ec0\u4e48\uff1f\n", "docs_svm = svm_retriever.get_relevant_documents(question)\n", "docs_svm[0]"]}, {"cell_type": "code", "execution_count": 67, "id": "2a1659c0", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \\npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \\ngroup the picture into regions. Let me actually blow that up so that you can see it more \\nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \\ngrouping the image into [inaudible] regions. \\nAnd what Ashutosh and Min did was they then applied the learning algorithm to say can \\nwe take this clustering and us e it to build a 3D model of the world? And so using the \\nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \\nworld looks like so that they could come up with a 3D model that you can sort of fly \\nthrough, okay? Although many people used to th ink it's not possible to take a single \\nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \\nalgorithm is the first step. They were able to. \\nI'll just show you one more example. I like this because it's a picture of Stanford with our \\nbeautiful Stanford campus. So again, taking th e same sort of clustering algorithms, taking \\nthe same sort of unsupervised learning algor ithm, you can group the pixels into different \\nregions. 
And using that as a pre-processing step, they eventually built this sort of 3D model of Stanford campus in a single picture. You can sort of walk into the ceiling, look\", metadata={})"]}, "execution_count": 67, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"what did they say about matlab?\" # \u4ed6\u4eec\u5173\u4e8eMatlab\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\n", "docs_tfidf = tfidf_retriever.get_relevant_documents(question)\n", "docs_tfidf[0]"]}, {"cell_type": "code", "execution_count": 78, "id": "5ade9c87", "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "svm_retriever_chinese = SVMRetriever.from_texts(splits_chinese, embedding)\n", "tfidf_retriever_chinese = TFIDFRetriever.from_texts(splits_chinese)"]}, {"cell_type": "code", "execution_count": 79, "id": "cc823bea", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='fig, ax = plt.subplots() \\n# step4 \u7ed8\u5236\u56fe\u50cf\uff0c \u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u2f06\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.plot(x, y, label=\\'linear\\') \\n# step5 \u6dfb\u52a0\u6807\u7b7e\uff0c\u2f42\u5b57\u548c\u56fe\u4f8b\uff0c\u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u56db\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n\u601d\u8003\u9898\\n\u8bf7\u601d\u8003\u4e24\u79cd\u7ed8\u56fe\u6a21\u5f0f\u7684\u4f18\u7f3a\u70b9\u548c\u5404\u2f83\u9002\u5408\u7684\u4f7f\u2f64\u573a\u666f\\n\u5728\u7b2c\u4e94\u8282\u7ed8\u56fe\u6a21\u677f\u4e2d\u6211\u4eec\u662f\u4ee5 OO \u6a21\u5f0f\u4f5c\u4e3a\u4f8b\u2f26\u5c55\u793a\u7684\uff0c\u8bf7\u601d\u8003\u5e76\u5199\u2f00\u4e2a pyplot \u7ed8\u56fe\u6a21\u5f0f\u7684\u7b80\u5355\u6a21\u677f', metadata={})"]}, "execution_count": 79, "metadata": {}, "output_type": "execute_result"}], "source": ["question_chinese = 
\"\u8fd9\u95e8\u8bfe\u7684\u4e3b\u8981\u4e3b\u9898\u662f\u4ec0\u4e48\uff1f\" \n", "docs_svm_chinese = svm_retriever_chinese.get_relevant_documents(question_chinese)\n", "docs_svm_chinese[0]"]}, {"cell_type": "code", "execution_count": 80, "id": "01eb9d43", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='fig, ax = plt.subplots() \\n# step4 \u7ed8\u5236\u56fe\u50cf\uff0c \u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u2f06\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.plot(x, y, label=\\'linear\\') \\n# step5 \u6dfb\u52a0\u6807\u7b7e\uff0c\u2f42\u5b57\u548c\u56fe\u4f8b\uff0c\u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u56db\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n\u601d\u8003\u9898\\n\u8bf7\u601d\u8003\u4e24\u79cd\u7ed8\u56fe\u6a21\u5f0f\u7684\u4f18\u7f3a\u70b9\u548c\u5404\u2f83\u9002\u5408\u7684\u4f7f\u2f64\u573a\u666f\\n\u5728\u7b2c\u4e94\u8282\u7ed8\u56fe\u6a21\u677f\u4e2d\u6211\u4eec\u662f\u4ee5 OO \u6a21\u5f0f\u4f5c\u4e3a\u4f8b\u2f26\u5c55\u793a\u7684\uff0c\u8bf7\u601d\u8003\u5e76\u5199\u2f00\u4e2a pyplot \u7ed8\u56fe\u6a21\u5f0f\u7684\u7b80\u5355\u6a21\u677f', metadata={})"]}, "execution_count": 80, "metadata": {}, "output_type": "execute_result"}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "docs_tfidf_chinese = tfidf_retriever_chinese.get_relevant_documents(question_chinese)\n", "docs_tfidf_chinese[0]"]}], "metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}}, "nbformat": 4, "nbformat_minor": 5}