diff --git a/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl new file mode 100644 index 0000000..b6f0c4f Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_929a3c06-e745-4005-beba-6d0b93e399fe.pkl b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_929a3c06-e745-4005-beba-6d0b93e399fe.pkl new file mode 100644 index 0000000..71f6b32 Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_929a3c06-e745-4005-beba-6d0b93e399fe.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_d2286c4f-c613-4617-bede-678577318a21.pkl b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_d2286c4f-c613-4617-bede-678577318a21.pkl new file mode 100644 index 0000000..0592b8d Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_d2286c4f-c613-4617-bede-678577318a21.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl new file mode 100644 index 0000000..92c72bf Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/id_to_uuid_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_2e82e9e8-21b4-417f-9108-7aef665dd192.bin b/content/LangChain Chat with Your Data/.chroma/index/index_2e82e9e8-21b4-417f-9108-7aef665dd192.bin new file mode 100644 index 0000000..b08251e Binary files /dev/null and b/content/LangChain Chat with Your 
Data/.chroma/index/index_2e82e9e8-21b4-417f-9108-7aef665dd192.bin differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_929a3c06-e745-4005-beba-6d0b93e399fe.bin b/content/LangChain Chat with Your Data/.chroma/index/index_929a3c06-e745-4005-beba-6d0b93e399fe.bin new file mode 100644 index 0000000..b08251e Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/index_929a3c06-e745-4005-beba-6d0b93e399fe.bin differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_d2286c4f-c613-4617-bede-678577318a21.bin b/content/LangChain Chat with Your Data/.chroma/index/index_d2286c4f-c613-4617-bede-678577318a21.bin new file mode 100644 index 0000000..433295c Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/index_d2286c4f-c613-4617-bede-678577318a21.bin differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_f4d0527f-8207-4f46-b2bf-b0502252f3d3.bin b/content/LangChain Chat with Your Data/.chroma/index/index_f4d0527f-8207-4f46-b2bf-b0502252f3d3.bin new file mode 100644 index 0000000..e054acd Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/index_f4d0527f-8207-4f46-b2bf-b0502252f3d3.bin differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_metadata_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl new file mode 100644 index 0000000..4452b5c Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_metadata_929a3c06-e745-4005-beba-6d0b93e399fe.pkl b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_929a3c06-e745-4005-beba-6d0b93e399fe.pkl new file mode 100644 index 0000000..611b98c Binary files /dev/null and b/content/LangChain Chat with 
Your Data/.chroma/index/index_metadata_929a3c06-e745-4005-beba-6d0b93e399fe.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_metadata_d2286c4f-c613-4617-bede-678577318a21.pkl b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_d2286c4f-c613-4617-bede-678577318a21.pkl new file mode 100644 index 0000000..83d146a Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_d2286c4f-c613-4617-bede-678577318a21.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/index_metadata_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl new file mode 100644 index 0000000..f720fe3 Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/index_metadata_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl new file mode 100644 index 0000000..d3819ed Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_2e82e9e8-21b4-417f-9108-7aef665dd192.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_929a3c06-e745-4005-beba-6d0b93e399fe.pkl b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_929a3c06-e745-4005-beba-6d0b93e399fe.pkl new file mode 100644 index 0000000..36d742a Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_929a3c06-e745-4005-beba-6d0b93e399fe.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_d2286c4f-c613-4617-bede-678577318a21.pkl b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_d2286c4f-c613-4617-bede-678577318a21.pkl new file mode 100644 index 
0000000..8994203 Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_d2286c4f-c613-4617-bede-678577318a21.pkl differ diff --git a/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl new file mode 100644 index 0000000..4b04380 Binary files /dev/null and b/content/LangChain Chat with Your Data/.chroma/index/uuid_to_id_f4d0527f-8207-4f46-b2bf-b0502252f3d3.pkl differ diff --git a/content/LangChain Chat with Your Data/5.检索 retrieval.ipynb b/content/LangChain Chat with Your Data/5.检索 retrieval.ipynb index ab07e26..6220011 100644 --- a/content/LangChain Chat with Your Data/5.检索 retrieval.ipynb +++ b/content/LangChain Chat with Your Data/5.检索 retrieval.ipynb @@ -1,1657 +1 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "0689733d", - "metadata": {}, - "source": [ - "# 第五章 检索(Retrieval)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "d12fbd74", - "metadata": {}, - "source": [ - "\n", - "检索是我们的检索增强生成(RAG)流程的核心。\n", - "\n", - "让我们获得前面课程存储的向量数据库(`VectorDB`)。" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ed2569c6", - "metadata": {}, - "source": [ - "## 一、向量数据库检索" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "651a10db", - "metadata": {}, - "source": [ - "在当前文件夹下新建`.env`文件,内容为`OPENAI_API_KEY = \"sk-...\"`\n", - "\n", - "本章节需要使用`lark`包,运行以下命令安装" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c18f8a7b-62af-403e-9877-18d1c2265b4f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install -Uq lark" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "51b15e5c-9b92-4d40-a149-b56335d330d9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "import openai\n", - "import sys\n", - 
"sys.path.append('../..')\n", - "\n", - "from dotenv import load_dotenv, find_dotenv\n", - "_ = load_dotenv(find_dotenv()) # read local .env file\n", - "\n", - "openai.api_key = os.environ['OPENAI_API_KEY']\n", - "os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'\n", - "os.environ[\"HTTP_PROXY\"] = 'http://127.0.0.1:7890'" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c2d552e1", - "metadata": {}, - "source": [ - "### 1.1 相似性搜索" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fe368042", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.vectorstores import Chroma\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "persist_directory = 'docs/chroma/cs229_lectures/'\n", - "persist_directory_chinese = 'docs/chroma/matplotlib/'" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5ba63e21", - "metadata": {}, - "source": [ - "将上节课所保存的向量数据库(`VectorDB`)加载进来" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a0189dc5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "embedding = OpenAIEmbeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2be10170", - "metadata": {}, - "outputs": [], - "source": [ - "vectordb = Chroma(\n", - " persist_directory=persist_directory,\n", - " embedding_function=embedding\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3659e0f7", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n" - ] - } - ], - "source": [ - "print(vectordb._collection.count())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a01ab000", - "metadata": {}, - "outputs": [], - "source": [ - "vectordb_chinese = Chroma(\n", - " persist_directory=persist_directory_chinese,\n", - " embedding_function=embedding\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": 
"a6998a03", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "27\n" - ] - } - ], - "source": [ - "print(vectordb_chinese._collection.count())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9ae4fdd8", - "metadata": {}, - "source": [ - "让我们现在来看看最大边际相关性的例子。因此,我们将从下面示例中加载有关蘑菇的信息。\n", - "\n", - "让我们现在运行它与MMR。让我们传入k等于2。我们仍然希望返回两个文档,但让我们设置获取k等于3,其中我们最初获取所有三个文档。我们现在可以看到,我们检索的文档中返回了有毒的信息。" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "a807c758", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "texts = [\n", - " \"\"\"The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).\"\"\",\n", - " \"\"\"A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.\"\"\",\n", - " \"\"\"A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.\"\"\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b110cceb", - "metadata": {}, - "outputs": [], - "source": [ - "texts_chinese = [\n", - " \"\"\"毒鹅膏菌(Amanita phalloides)具有大型且引人注目的地上(epigeous)子实体(basidiocarp)\"\"\",\n", - " \"\"\"一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。\"\"\",\n", - " \"\"\"A. 
phalloides,又名死亡帽,是已知所有蘑菇中最有毒的一种。\"\"\",\n", - "]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "84cd5f1c", - "metadata": {}, - "source": [ - "对于这个例子,我们将创建一个小数据库,我们可以作为一个示例来使用。" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "715d54f3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "smalldb = Chroma.from_texts(texts, embedding=embedding)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "305e1714", - "metadata": {}, - "outputs": [], - "source": [ - "smalldb_chinese = Chroma.from_texts(texts_chinese, embedding=embedding)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "239a8d95", - "metadata": {}, - "source": [ - "下面是我们对于这个示例所提出的问题" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "9a37b5a5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "question = \"Tell me about all-white mushrooms with large fruiting bodies\"" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "92312e57", - "metadata": {}, - "outputs": [], - "source": [ - "question_chinese = \"告诉我关于具有大型子实体的全白色蘑菇的信息\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "d3224a6d", - "metadata": {}, - "source": [ - "现在,我们可以运行一个相似性搜索,设置k=2,只返回两个最相关的文档。\n", - "\n", - "我们可以看到,没有提到它是有毒的事实。" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "24e3b025", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. 
Some varieties are all-white.', metadata={}),\n", - " Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "smalldb.similarity_search(question, k=2)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "d4c5a47d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. 
Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - }, - { - "data": { - "text/plain": [ - "[Document(page_content='一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。', metadata={}),\n", - " Document(page_content='毒鹅膏菌(Amanita phalloides)具有大型且引人注目的地上(epigeous)子实体(basidiocarp)', metadata={})]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "smalldb_chinese.similarity_search(question_chinese, k=2)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "bbb0ea94", - "metadata": {}, - "source": [ - "现在,让我们运行最大边际相关性(MMR)。\n", - "\n", - "设置k=2,因为我们仍然希望返回两个文档。设置fetch_k=3,fetch_k是我们最初获取的所有文档(3个)。" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "4daa6c0d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - }, - { - "data": { - "text/plain": [ - "[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),\n", - " Document(page_content='A. 
phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "e15521d2", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - }, - { - "data": { - "text/plain": [ - "[Document(page_content='一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。', metadata={}),\n", - " Document(page_content='A. phalloides,又名死亡帽,是已知所有蘑菇中最有毒的一种。', metadata={})]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "smalldb_chinese.max_marginal_relevance_search(question,k=2, fetch_k=3)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "e87c5f91", - "metadata": {}, - "source": [ - "我们现在可以看到,我们检索的文档中返回了有毒的信息。" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5a29e8c9", - "metadata": {}, - "source": [ - "### 1.2 解决多样性:最大边际相关性(MMR)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "a2b5c4ae", - "metadata": {}, - "source": [ - "\n", - "我们刚刚通过一个示例引出了一个问题:如何加强搜索结果的多样性。\n", - " \n", - "最大边际相关性(`Maximum marginal relevance`)试图在查询的相关性和结果的多样性之间实现两全其美。" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2360545c", - "metadata": {}, - "source": [ - "让我们回到上一节课的一个例子,当我们通过问题对向量数据库进行相似性搜索后\n", - "\n", - "我们可以看看前两个文档,只看前几个字符,可以看到它们是相同的。" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "9bb2c0a9", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 
seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - } - ], - "source": [ - "question = \"what did they say about matlab?\"\n", - "docs_ss = vectordb.similarity_search(question,k=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "f07f8793", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people '" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_ss[0].page_content[:100]" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "e9f7e165", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people '" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_ss[1].page_content[:100]" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "e8e142eb", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. 
Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - } - ], - "source": [ - "question_chinese = \"Matplotlib是什么?\"\n", - "docs_ss_chinese = vectordb_chinese.similarity_search(question_chinese,k=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "cf642f66", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种'" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_ss_chinese[0].page_content[:100]" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "1e9f5cfe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种'" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_ss_chinese[1].page_content[:100]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4c4ca1b6", - "metadata": {}, - "source": [ - "注意:使用`MMR`所得出结果的差异。" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "222234c5", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - } - ], - "source": [ - "docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "408935bc", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - } - ], - "source": [ - "docs_mmr_chinese = vectordb_chinese.max_marginal_relevance_search(question_chinese,k=3)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9076db81", - "metadata": {}, - "source": [ - "当我们运行MMR后得到结果时,我们可以看到第一个与之前的相同,因为那是最相似的。" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "93b20226", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people '" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_mmr[0].page_content[:100]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "d0acfaab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种'" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_mmr_chinese[0].page_content[:100]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7a93743f", - "metadata": {}, - "source": [ - "但是当我们进行到第二个时,我们可以看到它是不同的。\n", - "\n", - "它在回应中获得了一些多样性。" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "17d39762", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "\"mathematical work, he feels like he's disc overing truth and beauty in the universe. 
And \\nhe says it\"" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_mmr[1].page_content[:100]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "93d3206c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'By Datawhale 数据可视化开源⼩组\\n© Copyright © Copyright 2021.y轴分为左右两个,因此 tick1 对应左侧的轴; tick2 对应右侧的轴。\\nx轴分为上下两个'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs_mmr_chinese[1].page_content[:100]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b2b909bc", - "metadata": {}, - "source": [ - "### 1.3 解决特殊性:使用元数据" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7b63c5ee", - "metadata": {}, - "source": [ - "\n", - "在上一节课中,我们展示了一个问题,是询问了关于文档中某一讲的问题,但得到的结果中也包括了来自其他讲的结果。\n", - "\n", - "为了解决这一问题,很多向量数据库都支持对`metadata`的操作。\n", - "\n", - "`metadata`为每个嵌入的块(embedded chunk)提供上下文。" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "3c1a60b2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "question = \"what did they say about regression in the third lecture?\"" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "ba98df3c", - "metadata": {}, - "outputs": [], - "source": [ - "question_chinese = \"他们在第二讲中对Figure说了些什么?\" " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3873525e", - "metadata": {}, - "source": [ - "现在,我们以手动的方式来解决这个问题,我们会指定一个元数据过滤器`filter`" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "a8612840", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. 
Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - } - ], - "source": [ - "docs = vectordb.similarity_search(\n", - " question,\n", - " k=3,\n", - " filter={\"source\":\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\"}\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "b46c7e76", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.embeddings.openai.embed_with_retry.._embed_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - } - ], - "source": [ - "docs_chinese = vectordb_chinese.similarity_search(\n", - " question_chinese,\n", - " k=3,\n", - " filter={\"source\":\"docs/matplotlib/第二回:艺术画笔见乾坤.pdf\"}\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "869aee28", - "metadata": {}, - "source": [ - "接下来,我们可以看到结果都来自对应的章节" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "97031876", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 4}\n" - ] - } - ], - "source": [ - "for d in docs:\n", - " print(d.metadata)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "2708f6ae", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 9}\n", - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 10}\n", - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 0}\n" - ] - } - ], - "source": [ - "for d in docs_chinese:\n", - " print(d.metadata)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5e299f8e", - "metadata": {}, - 
"source": [ - "当然,我们不能每次都采用手动的方式来解决这个问题,这会显得不够智能\n", - "\n", - "下一小节将要展示通过LLM来解决这个问题" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ccc2d784", - "metadata": {}, - "source": [ - "### 1.4 解决特殊性:在元数据中使用自查询检索器" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "82ef44b6", - "metadata": {}, - "source": [ - "我们有一个有趣的挑战:我们通常希望从查询本身来推断元数据。\n", - "\n", - "为了解决这个问题,我们可以使用SelfQueryRetriever,它使用LLM来提取:\n", - " \n", - "1. 用于向量搜索的查询(`query`)字符串,即:问题\n", - "2. 要一起传入的元数据过滤器\n", - "\n", - "大多数向量数据库支持元数据过滤器,因此不需要任何新的数据库及索引。" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "b1d06084", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.llms import OpenAI\n", - "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", - "from langchain.chains.query_constructor.base import AttributeInfo" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "869c27c0", - "metadata": {}, - "outputs": [], - "source": [ - "llm = OpenAI(temperature=0)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "acd194c5", - "metadata": {}, - "source": [ - "`AttributeInfo`是我们可以指定元数据中的不同字段以及它们对应的位置。\n", - "\n", - "在元数据中,我们只有两个字段,`source`和`page`。\n", - "\n", - "我们将填写每个属性的名称、描述和类型的描述。\n", - "\n", - "这些信息实际上将被传递给LLM,所以需要尽可能详细地描述。" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "0aa5e698", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "metadata_field_info = [\n", - " AttributeInfo(\n", - " name=\"source\",\n", - " description=\"The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`\",\n", - " type=\"string\",\n", - " ),\n", - " AttributeInfo(\n", - " name=\"page\",\n", - " description=\"The page from the lecture\",\n", - " type=\"integer\",\n", - " ),\n", - "]" - ] - }, - { - 
"cell_type": "code", - "execution_count": 47, - "id": "544ad7c1", - "metadata": {}, - "outputs": [], - "source": [ - "metadata_field_info_chinese = [\n", - " AttributeInfo(\n", - " name=\"source\",\n", - " description=\"讲义来源于 `docs/matplotlib/第一回:Matplotlib初相识.pdf`, `docs/matplotlib/第二回:艺术画笔见乾坤.pdf`, or `docs/matplotlib/第三回:布局格式定方圆.pdf` 的其中之一\",\n", - " type=\"string\",\n", - " ),\n", - " AttributeInfo(\n", - " name=\"page\",\n", - " description=\"讲义的那一页\",\n", - " type=\"integer\",\n", - " ),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "e7906c15", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "document_content_description = \"Lecture notes\"\n", - "retriever = SelfQueryRetriever.from_llm(\n", - " llm,\n", - " vectordb,\n", - " document_content_description,\n", - " metadata_field_info,\n", - " verbose=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "d5b99571", - "metadata": {}, - "outputs": [], - "source": [ - "document_content_description_chinese = \"课堂讲义\"\n", - "retriever_chinese = SelfQueryRetriever.from_llm(\n", - " llm,\n", - " vectordb_chinese,\n", - " document_content_description_chinese,\n", - " metadata_field_info_chinese,\n", - " verbose=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "79d781b9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "question = \"what did they say about regression in the third lecture?\"" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "8d9b7e18", - "metadata": {}, - "outputs": [], - "source": [ - "question_chinese = \"他们在第二讲中对Figure说了些什么?\" " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c51778b0-1fcd-40a4-bd6b-0f13fec8acb1", - "metadata": {}, - "source": [ - "当你第一次执行下一行时,你会收到关于predict_and_parse已被弃用的**警告**。 这可以安全地忽略。" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "1d4f9f7d", - "metadata": { - "tags": [] - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query='regression' filter=Comparison(comparator=, attribute='source', value='docs/cs229_lectures/MachineLearning-Lecture03.pdf') limit=None\n" - ] - } - ], - "source": [ - "docs = retriever.get_relevant_documents(question)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "ea39a97e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query='Figure' filter=Comparison(comparator=, attribute='source', value='docs/matplotlib/第二回:艺术画笔见乾坤.pdf') limit=None\n" - ] - } - ], - "source": [ - "docs_chinese = retriever_chinese.get_relevant_documents(question_chinese)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "db04374e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n", - "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n" - ] - } - ], - "source": [ - "for d in docs:\n", - " print(d.metadata)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "143061f5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 9}\n", - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 10}\n", - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 0}\n", - "{'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf', 'page': 6}\n" - ] - } - ], - "source": [ - "for d in docs_chinese:\n", - " print(d.metadata)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "297b8168", - "metadata": {}, - "source": [ - "### 1.5 其他技巧:压缩" - ] - }, - { - "attachments": {}, - "cell_type": 
"markdown", - "id": "564144da", - "metadata": {}, - "source": [ - "另一种提高检索到的文档质量的方法是压缩。\n", - "\n", - "与查询最相关的信息可能隐藏在具有大量不相关文本的文档中。\n", - "\n", - "在应用程序中传递完整的文档可能会导致更昂贵的LLM调用和更差的响应。\n", - "\n", - "上下文压缩就是为了解决这个问题。" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "a060cf74", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.retrievers import ContextualCompressionRetriever\n", - "from langchain.retrievers.document_compressors import LLMChainExtractor" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "038649c8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def pretty_print_docs(docs):\n", - " print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "fc686cf2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llm = OpenAI(temperature=0)\n", - "compressor = LLMChainExtractor.from_llm(llm) # 压缩器" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "82794397", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "compression_retriever = ContextualCompressionRetriever(\n", - " base_compressor=compressor,\n", - " base_retriever=vectordb.as_retriever()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "915598f8", - "metadata": {}, - "outputs": [], - "source": [ - "compression_retriever_chinese = ContextualCompressionRetriever(\n", - " base_compressor=compressor,\n", - " base_retriever=vectordb_chinese.as_retriever()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "cde86848", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 
default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Document 1:\n", - "\n", - "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 2:\n", - "\n", - "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 3:\n", - "\n", - "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 4:\n", - "\n", - "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. 
It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n" - ] - } - ], - "source": [ - "question = \"what did they say about matlab?\"\n", - "compressed_docs = compression_retriever.get_relevant_documents(question)\n", - "pretty_print_docs(compressed_docs)" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "39726b24", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Document 1:\n", - "\n", - "Matplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,交互式的图表。\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 2:\n", - "\n", - "Matplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,交互式的图表。\n" - ] - } - ], - "source": [ - "question_chinese = \"Matplotlib是什么?\"\n", - "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", - "pretty_print_docs(compressed_docs_chinese)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "049b2601", - "metadata": {}, - "source": [ - "现在当我们提出问题后,查看结果文档\n", - "\n", - "我们可以看到两件事。\n", - "\n", - "1. 它们比正常文档短很多\n", - "2. 仍然有一些重复的东西,这是因为在底层我们使用的是语义搜索算法。\n", - "\n", - "这就是我们在本课程前面使用MMR解决的问题。\n", - "\n", - "这是一个很好的例子,你可以结合各种技术得到最好的可能结果。" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "82c4fc4d", - "metadata": {}, - "source": [ - "## 二、结合各种技术" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "54432975", - "metadata": {}, - "source": [ - "为了做到这一点,我们在从向量数据库创建检索器时,可以将搜索类型设置为MMR。\n", - "\n", - "然后我们可以重新运行这个过程,看到我们返回的是一个过滤过的结果集,其中不包含任何重复的信息。" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "161ae1ad", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "compression_retriever = ContextualCompressionRetriever(\n", - " base_compressor=compressor,\n", - " base_retriever=vectordb.as_retriever(search_type = \"mmr\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "cd6396bb", - "metadata": {}, - "outputs": [], - "source": [ - "compression_retriever_chinese = ContextualCompressionRetriever(\n", - " base_compressor=compressor,\n", - " base_retriever=vectordb_chinese.as_retriever(search_type = 
\"mmr\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "e77ccae1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", - "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-6XQxEC6cDJisHUyyNTit088C on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Document 1:\n", - "\n", - "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 2:\n", - "\n", - "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. 
It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n" - ] - } - ], - "source": [ - "question = \"what did they say about matlab?\"\n", - "compressed_docs = compression_retriever.get_relevant_documents(question)\n", - "pretty_print_docs(compressed_docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fe68a14b", - "metadata": {}, - "outputs": [], - "source": [ - "question_chinese = \"Matplotlib是什么?\"\n", - "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", - "pretty_print_docs(compressed_docs_chinese)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6c2b63e1", - "metadata": {}, - "source": [ - "## 三、其他类型的检索" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3e777a7b", - "metadata": {}, - "source": [ - "值得注意的是,vetordb并不是唯一一种检索文档的工具。\n", - "\n", - "`LangChain`检索器抽象包括其他检索文档的方式,如:`TF-IDF` 或 `SVM`。" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "83d2e808", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.retrievers import SVMRetriever\n", - "from langchain.retrievers import TFIDFRetriever\n", - "from langchain.document_loaders import PyPDFLoader\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "bcf5b760", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# 加载PDF\n", - "loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n", - "pages = loader.load()\n", - "all_page_text = [p.page_content for p in pages]\n", - "joined_page_text = \" \".join(all_page_text)\n", - "\n", - "# 分割文本\n", - "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", - "splits = text_splitter.split_text(joined_page_text)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "id": "1657e768", - "metadata": {}, - "outputs": [], - "source": [ - "# 加载PDF\n", - "loader_chinese = PyPDFLoader(\"docs/matplotlib/第一回:Matplotlib初相识.pdf\")\n", - "pages_chinese = loader_chinese.load()\n", - "all_page_text_chinese = [p.page_content for p in pages]\n", - "joined_page_text_chinese = \" \".join(all_page_text)\n", - "\n", - "# 分割文本\n", - "text_splitter_chinese = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", - "splits_chinese = text_splitter_chinese.split_text(joined_page_text_chinese)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59abbaff", - "metadata": {}, - "outputs": [], - "source": [ - "# 检索\n", - "svm_retriever = SVMRetriever.from_texts(splits, embedding)\n", - "tfidf_retriever = TFIDFRetriever.from_texts(splits)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bb0d781", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# 检索\n", - "svm_retriever_splitter = SVMRetriever.from_texts(splits_chinese, embedding)\n", - "tfidf_retriever_splitter = TFIDFRetriever.from_texts(splits_chinese)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7885389e", - "metadata": {}, - "outputs": [], - "source": [ - "question = \"What are major topics for this class?\" # 这门课的主要主题是什么?\n", - "docs_svm = svm_retriever.get_relevant_documents(question)\n", - "docs_svm[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a1659c0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "question = \"what did they say about matlab?\" # 他们关于Matlab说了些什么?\n", - "docs_tfidf = tfidf_retriever.get_relevant_documents(question)\n", - "docs_tfidf[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc823bea", - "metadata": {}, - "outputs": [], - "source": [ - "question_chinese = \"这门课的主要主题是什么?\" \n", - "docs_svm_chinese = svm_retriever_chinese.get_relevant_documents(question_chinese)\n", - 
"docs_svm_chinese[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01eb9d43", - "metadata": {}, - "outputs": [], - "source": [ - "question_chinese = \"Matplotlib是什么?\"\n", - "docs_tfidf_chinese = tfidf_retriever_chinese.get_relevant_documents(question_chinese)\n", - "docs_tfidf_chinese[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} +{"cells": [{"attachments": {}, "cell_type": "markdown", "id": "0689733d", "metadata": {}, "source": ["# \u7b2c\u4e94\u7ae0 \u68c0\u7d22(Retrieval)", "\n", " - [\u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22](#\u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22)\n", " - [1.1 \u76f8\u4f3c\u6027\u641c\u7d22](#1.1-\u76f8\u4f3c\u6027\u641c\u7d22)\n", " - [1.2 \u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)](#1.2-\u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR))\n", " - [1.3 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e](#1.3-\u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e)\n", " - [1.4 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668](#1.4-\u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668)\n", " - [1.5 \u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29](#1.5-\u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29)\n", " - [\u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f](#\u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f)\n", " - 
[\u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22](#\u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22)\n"]}, {"attachments": {}, "cell_type": "markdown", "id": "d12fbd74", "metadata": {}, "source": ["\n", "\u68c0\u7d22\u662f\u6211\u4eec\u7684\u68c0\u7d22\u589e\u5f3a\u751f\u6210(RAG)\u6d41\u7a0b\u7684\u6838\u5fc3\u3002\n", "\n", "\u8ba9\u6211\u4eec\u83b7\u5f97\u524d\u9762\u8bfe\u7a0b\u5b58\u50a8\u7684\u5411\u91cf\u6570\u636e\u5e93(`VectorDB`)\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "ed2569c6", "metadata": {}, "source": ["## \u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22"]}, {"attachments": {}, "cell_type": "markdown", "id": "651a10db", "metadata": {}, "source": ["\u5728\u5f53\u524d\u6587\u4ef6\u5939\u4e0b\u65b0\u5efa`.env`\u6587\u4ef6\uff0c\u5185\u5bb9\u4e3a`OPENAI_API_KEY = \"sk-...\"`\n", "\n", "\u672c\u7ae0\u8282\u9700\u8981\u4f7f\u7528`lark`\u5305\uff0c\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5"]}, {"cell_type": "code", "execution_count": 1, "id": "c18f8a7b-62af-403e-9877-18d1c2265b4f", "metadata": {"tags": []}, "outputs": [], "source": ["!pip install -Uq lark"]}, {"cell_type": "code", "execution_count": 3, "id": "51b15e5c-9b92-4d40-a149-b56335d330d9", "metadata": {"tags": []}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"attachments": {}, "cell_type": "markdown", "id": "c2d552e1", "metadata": {}, "source": ["### 1.1 \u76f8\u4f3c\u6027\u641c\u7d22"]}, {"cell_type": "code", "execution_count": 4, "id": "fe368042", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma/cs229_lectures/'\n", "persist_directory_chinese = 'docs/chroma/matplotlib/'"]}, 
{"attachments": {}, "cell_type": "markdown", "id": "5ba63e21", "metadata": {}, "source": ["\u5c06\u4e0a\u8282\u8bfe\u6240\u4fdd\u5b58\u7684\u5411\u91cf\u6570\u636e\u5e93(`VectorDB`)\u52a0\u8f7d\u8fdb\u6765"]}, {"cell_type": "code", "execution_count": 5, "id": "a0189dc5", "metadata": {"tags": []}, "outputs": [], "source": ["embedding = OpenAIEmbeddings()"]}, {"cell_type": "code", "execution_count": 6, "id": "2be10170", "metadata": {}, "outputs": [], "source": ["vectordb = Chroma(\n", " persist_directory=persist_directory,\n", " embedding_function=embedding\n", ")"]}, {"cell_type": "code", "execution_count": 7, "id": "3659e0f7", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["print(vectordb._collection.count())"]}, {"cell_type": "code", "execution_count": 8, "id": "a01ab000", "metadata": {}, "outputs": [], "source": ["vectordb_chinese = Chroma(\n", " persist_directory=persist_directory_chinese,\n", " embedding_function=embedding\n", ")"]}, {"cell_type": "code", "execution_count": 9, "id": "a6998a03", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["27\n"]}], "source": ["print(vectordb_chinese._collection.count())"]}, {"attachments": {}, "cell_type": "markdown", "id": "9ae4fdd8", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u73b0\u5728\u6765\u770b\u770b\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027\u7684\u4f8b\u5b50\u3002\u4e3a\u6b64\uff0c\u6211\u4eec\u5c06\u4ece\u4e0b\u9762\u793a\u4f8b\u4e2d\u52a0\u8f7d\u6709\u5173\u8611\u83c7\u7684\u4fe1\u606f\u3002\n", "\n",
"\u8ba9\u6211\u4eec\u73b0\u5728\u7528MMR\u8fd0\u884c\u5b83\u3002\u8ba9\u6211\u4eec\u4f20\u5165k\u7b49\u4e8e2\u3002\u6211\u4eec\u4ecd\u7136\u5e0c\u671b\u8fd4\u56de\u4e24\u4e2a\u6587\u6863\uff0c\u4f46\u8ba9\u6211\u4eec\u8bbe\u7f6efetch_k\u7b49\u4e8e3\uff0c\u5373\u6211\u4eec\u6700\u521d\u83b7\u53d6\u6240\u6709\u4e09\u4e2a\u6587\u6863\u3002\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u770b\u5230\uff0c\u6211\u4eec\u68c0\u7d22\u7684\u6587\u6863\u4e2d\u8fd4\u56de\u4e86\u6709\u6bd2\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": 10, "id": "a807c758", "metadata": {"tags": []}, "outputs": [], "source": ["texts = [\n", " \"\"\"The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).\"\"\",\n", " \"\"\"A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.\"\"\",\n", " \"\"\"A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.\"\"\",\n", "]"]}, {"cell_type": "code", "execution_count": 11, "id": "b110cceb", "metadata": {}, "outputs": [], "source": ["texts_chinese = [\n", " \"\"\"\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u5177\u6709\u5927\u578b\u4e14\u5f15\u4eba\u6ce8\u76ee\u7684\u5730\u4e0a\uff08epigeous\uff09\u5b50\u5b9e\u4f53\uff08basidiocarp\uff09\"\"\",\n", " \"\"\"\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002\"\"\",\n", " \"\"\"A.
phalloides\uff0c\u53c8\u540d\u6b7b\u4ea1\u5e3d\uff0c\u662f\u5df2\u77e5\u6240\u6709\u8611\u83c7\u4e2d\u6700\u6709\u6bd2\u7684\u4e00\u79cd\u3002\"\"\",\n", "]"]}, {"attachments": {}, "cell_type": "markdown", "id": "84cd5f1c", "metadata": {}, "source": ["\u5bf9\u4e8e\u8fd9\u4e2a\u4f8b\u5b50\uff0c\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2a\u5c0f\u6570\u636e\u5e93\uff0c\u6211\u4eec\u53ef\u4ee5\u4f5c\u4e3a\u4e00\u4e2a\u793a\u4f8b\u6765\u4f7f\u7528\u3002"]}, {"cell_type": "code", "execution_count": 12, "id": "715d54f3", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1.28it/s]\n"]}], "source": ["smalldb = Chroma.from_texts(texts, embedding=embedding)"]}, {"cell_type": "code", "execution_count": 13, "id": "305e1714", "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 2.30it/s]\n"]}], "source": ["smalldb_chinese = Chroma.from_texts(texts_chinese, embedding=embedding)"]}, {"attachments": {}, "cell_type": "markdown", "id": "239a8d95", "metadata": {}, "source": ["\u4e0b\u9762\u662f\u6211\u4eec\u5bf9\u4e8e\u8fd9\u4e2a\u793a\u4f8b\u6240\u63d0\u51fa\u7684\u95ee\u9898"]}, {"cell_type": "code", "execution_count": 14, "id": "9a37b5a5", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"Tell me about all-white mushrooms with large fruiting bodies\""]}, {"cell_type": "code", "execution_count": 15, "id": "92312e57", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u544a\u8bc9\u6211\u5173\u4e8e\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u5168\u767d\u8272\u8611\u83c7\u7684\u4fe1\u606f\""]}, {"attachments": {}, "cell_type": "markdown", "id": "d3224a6d", "metadata": {}, "source": 
["\u73b0\u5728\uff0c\u6211\u4eec\u53ef\u4ee5\u8fd0\u884c\u4e00\u4e2a\u76f8\u4f3c\u6027\u641c\u7d22\uff0c\u8bbe\u7f6ek=2\uff0c\u53ea\u8fd4\u56de\u4e24\u4e2a\u6700\u76f8\u5173\u7684\u6587\u6863\u3002\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u5230\uff0c\u6ca1\u6709\u63d0\u5230\u5b83\u662f\u6709\u6bd2\u7684\u4e8b\u5b9e\u3002"]}, {"cell_type": "code", "execution_count": 16, "id": "24e3b025", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),\n", " Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]"]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb.similarity_search(question, k=2)"]}, {"cell_type": "code", "execution_count": 17, "id": "d4c5a47d", "metadata": {}, "outputs": [{"data": {"text/plain": ["[Document(page_content='\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002', metadata={}),\n", " Document(page_content='\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u5177\u6709\u5927\u578b\u4e14\u5f15\u4eba\u6ce8\u76ee\u7684\u5730\u4e0a\uff08epigeous\uff09\u5b50\u5b9e\u4f53\uff08basidiocarp\uff09', metadata={})]"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb_chinese.similarity_search(question_chinese, k=2)"]}, {"attachments": {}, "cell_type": "markdown", "id": "bbb0ea94", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u8ba9\u6211\u4eec\u8fd0\u884c\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)\u3002\n", "\n", 
"\u8bbe\u7f6ek=2\uff0c\u56e0\u4e3a\u6211\u4eec\u4ecd\u7136\u5e0c\u671b\u8fd4\u56de\u4e24\u4e2a\u6587\u6863\u3002\u8bbe\u7f6efetch_k=3\uff0cfetch_k\u662f\u6211\u4eec\u6700\u521d\u83b7\u53d6\u7684\u6240\u6709\u6587\u6863(3\u4e2a)\u3002"]}, {"cell_type": "code", "execution_count": 18, "id": "4daa6c0d", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),\n", " Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)"]}, {"cell_type": "code", "execution_count": 19, "id": "e15521d2", "metadata": {}, "outputs": [{"data": {"text/plain": ["[Document(page_content='\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002', metadata={}),\n", " Document(page_content='A. 
phalloides\uff0c\u53c8\u540d\u6b7b\u4ea1\u5e3d\uff0c\u662f\u5df2\u77e5\u6240\u6709\u8611\u83c7\u4e2d\u6700\u6709\u6bd2\u7684\u4e00\u79cd\u3002', metadata={})]"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb_chinese.max_marginal_relevance_search(question,k=2, fetch_k=3)"]}, {"attachments": {}, "cell_type": "markdown", "id": "e87c5f91", "metadata": {}, "source": ["\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u770b\u5230\uff0c\u6211\u4eec\u68c0\u7d22\u7684\u6587\u6863\u4e2d\u8fd4\u56de\u4e86\u6709\u6bd2\u7684\u4fe1\u606f\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "5a29e8c9", "metadata": {}, "source": ["### 1.2 \u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)"]}, {"attachments": {}, "cell_type": "markdown", "id": "a2b5c4ae", "metadata": {}, "source": ["\n", "\u6211\u4eec\u521a\u521a\u901a\u8fc7\u4e00\u4e2a\u793a\u4f8b\u5f15\u51fa\u4e86\u4e00\u4e2a\u95ee\u9898\uff1a\u5982\u4f55\u52a0\u5f3a\u641c\u7d22\u7ed3\u679c\u7684\u591a\u6837\u6027\u3002\n", " \n", "\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(`Maximum marginal relevance`)\u8bd5\u56fe\u5728\u67e5\u8be2\u7684\u76f8\u5173\u6027\u548c\u7ed3\u679c\u7684\u591a\u6837\u6027\u4e4b\u95f4\u5b9e\u73b0\u4e24\u5168\u5176\u7f8e\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "2360545c", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u56de\u5230\u4e0a\u4e00\u8282\u8bfe\u7684\u4e00\u4e2a\u4f8b\u5b50\uff0c\u5f53\u6211\u4eec\u901a\u8fc7\u95ee\u9898\u5bf9\u5411\u91cf\u6570\u636e\u5e93\u8fdb\u884c\u76f8\u4f3c\u6027\u641c\u7d22\u540e\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u770b\u524d\u4e24\u4e2a\u6587\u6863\uff0c\u53ea\u770b\u524d\u51e0\u4e2a\u5b57\u7b26\uff0c\u53ef\u4ee5\u770b\u5230\u5b83\u4eec\u662f\u76f8\u540c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 20, "id": "9bb2c0a9", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about matlab?\"\n", "docs_ss = 
vectordb.similarity_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 21, "id": "f07f8793", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 22, "id": "e9f7e165", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss[1].page_content[:100]"]}, {"cell_type": "code", "execution_count": 23, "id": "e8e142eb", "metadata": {}, "outputs": [], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "docs_ss_chinese = vectordb_chinese.similarity_search(question_chinese,k=3)"]}, {"cell_type": "code", "execution_count": 24, "id": "cf642f66", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss_chinese[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 25, "id": "1e9f5cfe", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D 
\u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss_chinese[1].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "4c4ca1b6", "metadata": {}, "source": ["\u6ce8\u610f\uff1a\u4f7f\u7528`MMR`\u6240\u5f97\u51fa\u7ed3\u679c\u7684\u5dee\u5f02\u3002"]}, {"cell_type": "code", "execution_count": 26, "id": "222234c5", "metadata": {"tags": []}, "outputs": [], "source": ["docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 27, "id": "408935bc", "metadata": {}, "outputs": [], "source": ["docs_mmr_chinese = vectordb_chinese.max_marginal_relevance_search(question_chinese,k=3)"]}, {"attachments": {}, "cell_type": "markdown", "id": "9076db81", "metadata": {}, "source": ["\u5f53\u6211\u4eec\u8fd0\u884cMMR\u540e\u5f97\u5230\u7ed3\u679c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u7b2c\u4e00\u4e2a\u4e0e\u4e4b\u524d\u7684\u76f8\u540c\uff0c\u56e0\u4e3a\u90a3\u662f\u6700\u76f8\u4f3c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 28, "id": "93b20226", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 28, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 29, "id": "d0acfaab", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D 
\u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr_chinese[0].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "7a93743f", "metadata": {}, "source": ["\u4f46\u662f\u5f53\u6211\u4eec\u8fdb\u884c\u5230\u7b2c\u4e8c\u4e2a\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u662f\u4e0d\u540c\u7684\u3002\n", "\n", "\u5b83\u5728\u56de\u5e94\u4e2d\u83b7\u5f97\u4e86\u4e00\u4e9b\u591a\u6837\u6027\u3002"]}, {"cell_type": "code", "execution_count": 30, "id": "17d39762", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'algorithm then? So what\u2019s different? How come I was making all that noise earlier about \\nleast squa'"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr[1].page_content[:100]"]}, {"cell_type": "code", "execution_count": 31, "id": "93d3206c", "metadata": {}, "outputs": [{"data": {"text/plain": ["'By Datawhale \u6570\u636e\u53ef\u89c6\u5316\u5f00\u6e90\u2f29\u7ec4\\n\u00a9 Copyright \u00a9 Copyright 2021.y\u8f74\u5206\u4e3a\u5de6\u53f3\u4e24\u4e2a\uff0c\u56e0\u6b64 tick1 \u5bf9\u5e94\u5de6\u4fa7\u7684\u8f74\uff1b tick2 \u5bf9\u5e94\u53f3\u4fa7\u7684\u8f74\u3002\\nx\u8f74\u5206\u4e3a\u4e0a\u4e0b\u4e24\u4e2a'"]}, "execution_count": 31, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr_chinese[1].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "b2b909bc", "metadata": {}, "source": ["### 1.3 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e"]}, {"attachments": {}, "cell_type": "markdown", "id": "7b63c5ee", "metadata": {}, "source": ["\n", 
"\u5728\u4e0a\u4e00\u8282\u8bfe\u4e2d\uff0c\u6211\u4eec\u5c55\u793a\u4e86\u4e00\u4e2a\u95ee\u9898\uff0c\u662f\u8be2\u95ee\u4e86\u5173\u4e8e\u6587\u6863\u4e2d\u67d0\u4e00\u8bb2\u7684\u95ee\u9898\uff0c\u4f46\u5f97\u5230\u7684\u7ed3\u679c\u4e2d\u4e5f\u5305\u62ec\u4e86\u6765\u81ea\u5176\u4ed6\u8bb2\u7684\u7ed3\u679c\u3002\n", "\n", "\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e00\u95ee\u9898\uff0c\u5f88\u591a\u5411\u91cf\u6570\u636e\u5e93\u90fd\u652f\u6301\u5bf9`metadata`\u7684\u64cd\u4f5c\u3002\n", "\n", "`metadata`\u4e3a\u6bcf\u4e2a\u5d4c\u5165\u7684\u5757(embedded chunk)\u63d0\u4f9b\u4e0a\u4e0b\u6587\u3002"]}, {"cell_type": "code", "execution_count": 32, "id": "3c1a60b2", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\""]}, {"cell_type": "code", "execution_count": 33, "id": "ba98df3c", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"attachments": {}, "cell_type": "markdown", "id": "3873525e", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u6211\u4eec\u4ee5\u624b\u52a8\u7684\u65b9\u5f0f\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u4f1a\u6307\u5b9a\u4e00\u4e2a\u5143\u6570\u636e\u8fc7\u6ee4\u5668`filter`"]}, {"cell_type": "code", "execution_count": 34, "id": "a8612840", "metadata": {"tags": []}, "outputs": [], "source": ["docs = vectordb.similarity_search(\n", " question,\n", " k=3,\n", " filter={\"source\":\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\"}\n", ")"]}, {"cell_type": "code", "execution_count": 35, "id": "b46c7e76", "metadata": {}, "outputs": [], "source": ["docs_chinese = vectordb_chinese.similarity_search(\n", " question_chinese,\n", " k=3,\n", " filter={\"source\":\"docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf\"}\n", ")"]}, {"attachments": {}, "cell_type": "markdown", "id": "869aee28", "metadata": {}, "source": 
["\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u7ed3\u679c\u90fd\u6765\u81ea\u5bf9\u5e94\u7684\u7ae0\u8282"]}, {"cell_type": "code", "execution_count": 36, "id": "97031876", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 4}\n"]}], "source": ["for d in docs:\n", " print(d.metadata)"]}, {"cell_type": "code", "execution_count": 35, "id": "2708f6ae", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 9}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 10}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 0}\n"]}], "source": ["for d in docs_chinese:\n", " print(d.metadata)\n", " "]}, {"attachments": {}, "cell_type": "markdown", "id": "5e299f8e", "metadata": {}, "source": ["\u5f53\u7136\uff0c\u6211\u4eec\u4e0d\u80fd\u6bcf\u6b21\u90fd\u91c7\u7528\u624b\u52a8\u7684\u65b9\u5f0f\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u8fd9\u4f1a\u663e\u5f97\u4e0d\u591f\u667a\u80fd\n", "\n", "\u4e0b\u4e00\u5c0f\u8282\u5c06\u8981\u5c55\u793a\u901a\u8fc7LLM\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898"]}, {"attachments": {}, "cell_type": "markdown", "id": "ccc2d784", "metadata": {}, "source": ["### 1.4 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668"]}, {"attachments": {}, "cell_type": "markdown", "id": "82ef44b6", "metadata": {}, "source": 
["\u6211\u4eec\u6709\u4e00\u4e2a\u6709\u8da3\u7684\u6311\u6218\uff1a\u6211\u4eec\u901a\u5e38\u5e0c\u671b\u4ece\u67e5\u8be2\u672c\u8eab\u6765\u63a8\u65ad\u5143\u6570\u636e\u3002\n", "\n", "\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528SelfQueryRetriever\uff0c\u5b83\u4f7f\u7528LLM\u6765\u63d0\u53d6\uff1a\n", " \n", "1. \u7528\u4e8e\u5411\u91cf\u641c\u7d22\u7684\u67e5\u8be2(`query`)\u5b57\u7b26\u4e32\uff0c\u5373\uff1a\u95ee\u9898\n", "2. \u8981\u4e00\u8d77\u4f20\u5165\u7684\u5143\u6570\u636e\u8fc7\u6ee4\u5668\n", "\n", "\u5927\u591a\u6570\u5411\u91cf\u6570\u636e\u5e93\u652f\u6301\u5143\u6570\u636e\u8fc7\u6ee4\u5668\uff0c\u56e0\u6b64\u4e0d\u9700\u8981\u4efb\u4f55\u65b0\u7684\u6570\u636e\u5e93\u53ca\u7d22\u5f15\u3002"]}, {"cell_type": "code", "execution_count": 38, "id": "b1d06084", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.llms import OpenAI\n", "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", "from langchain.chains.query_constructor.base import AttributeInfo"]}, {"cell_type": "code", "execution_count": 39, "id": "869c27c0", "metadata": {}, "outputs": [], "source": ["llm = OpenAI(temperature=0)"]}, {"attachments": {}, "cell_type": "markdown", "id": "acd194c5", "metadata": {}, "source": ["`AttributeInfo`\u7528\u4e8e\u6307\u5b9a\u5143\u6570\u636e\u4e2d\u7684\u4e0d\u540c\u5b57\u6bb5\u4ee5\u53ca\u5b83\u4eec\u5bf9\u5e94\u7684\u542b\u4e49\u3002\n", "\n", "\u5728\u5143\u6570\u636e\u4e2d\uff0c\u6211\u4eec\u53ea\u6709\u4e24\u4e2a\u5b57\u6bb5\uff0c`source`\u548c`page`\u3002\n", "\n", "\u6211\u4eec\u5c06\u586b\u5199\u6bcf\u4e2a\u5c5e\u6027\u7684\u540d\u79f0\u3001\u63cf\u8ff0\u548c\u7c7b\u578b\u3002\n", "\n", "\u8fd9\u4e9b\u4fe1\u606f\u5b9e\u9645\u4e0a\u5c06\u88ab\u4f20\u9012\u7ed9LLM\uff0c\u6240\u4ee5\u9700\u8981\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u63cf\u8ff0\u3002"]}, {"cell_type": "code", "execution_count": 40, "id": "0aa5e698", "metadata": {"tags":
[]}, "outputs": [], "source": ["metadata_field_info = [\n", " AttributeInfo(\n", " name=\"source\",\n", " description=\"The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", " name=\"page\",\n", " description=\"The page from the lecture\",\n", " type=\"integer\",\n", " ),\n", "]"]}, {"cell_type": "code", "execution_count": 41, "id": "544ad7c1", "metadata": {}, "outputs": [], "source": ["metadata_field_info_chinese = [\n", " AttributeInfo(\n", " name=\"source\",\n", " description=\"\u8bb2\u4e49\u6765\u6e90\u4e8e `docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf`, `docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf`, or `docs/matplotlib/\u7b2c\u4e09\u56de\uff1a\u5e03\u5c40\u683c\u5f0f\u5b9a\u65b9\u5706.pdf` \u7684\u5176\u4e2d\u4e4b\u4e00\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", " name=\"page\",\n", " description=\"\u8bb2\u4e49\u7684\u90a3\u4e00\u9875\",\n", " type=\"integer\",\n", " ),\n", "]"]}, {"cell_type": "code", "execution_count": 42, "id": "e7906c15", "metadata": {"tags": []}, "outputs": [], "source": ["document_content_description = \"Lecture notes\"\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm,\n", " vectordb,\n", " document_content_description,\n", " metadata_field_info,\n", " verbose=True\n", ")"]}, {"cell_type": "code", "execution_count": 43, "id": "d5b99571", "metadata": {}, "outputs": [], "source": ["document_content_description_chinese = \"\u8bfe\u5802\u8bb2\u4e49\"\n", "retriever_chinese = SelfQueryRetriever.from_llm(\n", " llm,\n", " vectordb_chinese,\n", " document_content_description_chinese,\n", " metadata_field_info_chinese,\n", " verbose=True\n", ")"]}, {"cell_type": "code", "execution_count": 44, "id": "79d781b9", "metadata": {"tags": []}, 
"outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\""]}, {"cell_type": "code", "execution_count": 45, "id": "8d9b7e18", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"attachments": {}, "cell_type": "markdown", "id": "c51778b0-1fcd-40a4-bd6b-0f13fec8acb1", "metadata": {}, "source": ["\u5f53\u4f60\u7b2c\u4e00\u6b21\u6267\u884c\u4e0b\u4e00\u884c\u65f6\uff0c\u4f60\u4f1a\u6536\u5230\u5173\u4e8epredict_and_parse\u5df2\u88ab\u5f03\u7528\u7684**\u8b66\u544a**\u3002 \u8fd9\u53ef\u4ee5\u5b89\u5168\u5730\u5ffd\u7565\u3002"]}, {"cell_type": "code", "execution_count": 46, "id": "1d4f9f7d", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["/root/autodl-tmp/env/gpt/lib/python3.10/site-packages/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n"]}, {"name": "stdout", "output_type": "stream", "text": ["query='regression' filter=Comparison(comparator=, attribute='source', value='docs/cs229_lectures/MachineLearning-Lecture03.pdf') limit=None\n"]}], "source": ["docs = retriever.get_relevant_documents(question)"]}, {"cell_type": "code", "execution_count": 47, "id": "ea39a97e", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["query='Figure' filter=Comparison(comparator=, attribute='source', value='docs/matplotlib/\u7b2c\u4e8c\u8bb2\uff1a\u827a\u672f\u753b\u89e3\u7834.pdf') limit=None\n"]}], "source": ["docs_chinese = retriever_chinese.get_relevant_documents(question_chinese)"]}, {"cell_type": "code", "execution_count": 48, "id": "db04374e", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 
'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n"]}], "source": ["for d in docs:\n", " print(d.metadata)"]}, {"cell_type": "code", "execution_count": 49, "id": "143061f5", "metadata": {}, "outputs": [], "source": ["for d in docs_chinese:\n", " print(d.metadata)"]}, {"attachments": {}, "cell_type": "markdown", "id": "297b8168", "metadata": {}, "source": ["### 1.5 \u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29"]}, {"attachments": {}, "cell_type": "markdown", "id": "564144da", "metadata": {}, "source": ["\u53e6\u4e00\u79cd\u63d0\u9ad8\u68c0\u7d22\u5230\u7684\u6587\u6863\u8d28\u91cf\u7684\u65b9\u6cd5\u662f\u538b\u7f29\u3002\n", "\n", "\u4e0e\u67e5\u8be2\u6700\u76f8\u5173\u7684\u4fe1\u606f\u53ef\u80fd\u9690\u85cf\u5728\u5177\u6709\u5927\u91cf\u4e0d\u76f8\u5173\u6587\u672c\u7684\u6587\u6863\u4e2d\u3002\n", "\n", "\u5728\u5e94\u7528\u7a0b\u5e8f\u4e2d\u4f20\u9012\u5b8c\u6574\u7684\u6587\u6863\u53ef\u80fd\u4f1a\u5bfc\u81f4\u66f4\u6602\u8d35\u7684LLM\u8c03\u7528\u548c\u66f4\u5dee\u7684\u54cd\u5e94\u3002\n", "\n", "\u4e0a\u4e0b\u6587\u538b\u7f29\u5c31\u662f\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 50, "id": "a060cf74", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.retrievers import ContextualCompressionRetriever\n", "from langchain.retrievers.document_compressors import LLMChainExtractor"]}, {"cell_type": "code", "execution_count": 51, "id": "038649c8", "metadata": {"tags": []}, "outputs": [], "source": ["def pretty_print_docs(docs):\n", " print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"]}, {"cell_type": "code", "execution_count": 52, "id": "fc686cf2", "metadata": {"tags": []}, "outputs": [], "source": ["llm = OpenAI(temperature=0)\n", "compressor = 
LLMChainExtractor.from_llm(llm) # \u538b\u7f29\u5668"]}, {"cell_type": "code", "execution_count": 53, "id": "82794397", "metadata": {"tags": []}, "outputs": [], "source": ["compression_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 54, "id": "915598f8", "metadata": {}, "outputs": [], "source": ["compression_retriever_chinese = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb_chinese.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 55, "id": "cde86848", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["/root/autodl-tmp/env/gpt/lib/python3.10/site-packages/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n"]}, {"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. 
And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 3:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 4:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n"]}], "source": ["question = \"what did they say about matlab?\"\n", "compressed_docs = compression_retriever.get_relevant_documents(question)\n", "pretty_print_docs(compressed_docs)"]}, {"cell_type": "code", "execution_count": 56, "id": "39726b24", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D 
\u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n"]}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", "pretty_print_docs(compressed_docs_chinese)"]}, {"attachments": {}, "cell_type": "markdown", "id": "049b2601", "metadata": {}, "source": ["\u73b0\u5728\u5f53\u6211\u4eec\u63d0\u51fa\u95ee\u9898\u540e\uff0c\u67e5\u770b\u7ed3\u679c\u6587\u6863\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u4e24\u4ef6\u4e8b\u3002\n", "\n", "1. \u5b83\u4eec\u6bd4\u6b63\u5e38\u6587\u6863\u77ed\u5f88\u591a\n", "2. \u4ecd\u7136\u6709\u4e00\u4e9b\u91cd\u590d\u7684\u4e1c\u897f\uff0c\u8fd9\u662f\u56e0\u4e3a\u5728\u5e95\u5c42\u6211\u4eec\u4f7f\u7528\u7684\u662f\u8bed\u4e49\u641c\u7d22\u7b97\u6cd5\u3002\n", "\n", "\u8fd9\u5c31\u662f\u6211\u4eec\u5728\u672c\u8bfe\u7a0b\u524d\u9762\u4f7f\u7528MMR\u89e3\u51b3\u7684\u95ee\u9898\u3002\n", "\n", "\u8fd9\u662f\u4e00\u4e2a\u5f88\u597d\u7684\u4f8b\u5b50\uff0c\u4f60\u53ef\u4ee5\u7ed3\u5408\u5404\u79cd\u6280\u672f\u5f97\u5230\u6700\u597d\u7684\u53ef\u80fd\u7ed3\u679c\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "82c4fc4d", "metadata": {}, "source": ["## \u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f"]}, {"attachments": {}, "cell_type": "markdown", "id": "54432975", "metadata": {}, "source": ["\u4e3a\u4e86\u505a\u5230\u8fd9\u4e00\u70b9\uff0c\u6211\u4eec\u5728\u4ece\u5411\u91cf\u6570\u636e\u5e93\u521b\u5efa\u68c0\u7d22\u5668\u65f6\uff0c\u53ef\u4ee5\u5c06\u641c\u7d22\u7c7b\u578b\u8bbe\u7f6e\u4e3aMMR\u3002\n", "\n", 
"\u7136\u540e\u6211\u4eec\u53ef\u4ee5\u91cd\u65b0\u8fd0\u884c\u8fd9\u4e2a\u8fc7\u7a0b\uff0c\u770b\u5230\u6211\u4eec\u8fd4\u56de\u7684\u662f\u4e00\u4e2a\u8fc7\u6ee4\u8fc7\u7684\u7ed3\u679c\u96c6\uff0c\u5176\u4e2d\u4e0d\u5305\u542b\u4efb\u4f55\u91cd\u590d\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": 57, "id": "161ae1ad", "metadata": {"tags": []}, "outputs": [], "source": ["compression_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb.as_retriever(search_type = \"mmr\")\n", ")"]}, {"cell_type": "code", "execution_count": 58, "id": "cd6396bb", "metadata": {}, "outputs": [], "source": ["compression_retriever_chinese = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb_chinese.as_retriever(search_type = \"mmr\")\n", ")"]}, {"cell_type": "code", "execution_count": 59, "id": "e77ccae1", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. 
It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n"]}], "source": ["question = \"what did they say about matlab?\"\n", "compressed_docs = compression_retriever.get_relevant_documents(question)\n", "pretty_print_docs(compressed_docs)"]}, {"cell_type": "code", "execution_count": 60, "id": "fe68a14b", "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {\n", " \"error\": {\n", " \"message\": \"Request failed due to server shutdown\",\n", " \"type\": \"server_error\",\n", " \"param\": null,\n", " \"code\": null\n", " }\n", "}\n", " 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Sun, 16 Jul 2023 05:28:06 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'text-davinci-003', 'openai-organization': 'user-xnghkpntwvm31crtmex7n2j0', 'openai-processing-ms': '1159', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '3000', 'x-ratelimit-limit-tokens': '250000', 'x-ratelimit-remaining-requests': '2999', 'x-ratelimit-remaining-tokens': '249744', 'x-ratelimit-reset-requests': '20ms', 'x-ratelimit-reset-tokens': '61ms', 'x-request-id': '22b2986140ddb5bcc689423e056b7daf', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '7e77d6f7590b1ec8-NRT', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n"]}, {"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D 
\u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n"]}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", "pretty_print_docs(compressed_docs_chinese)"]}, {"attachments": {}, "cell_type": "markdown", "id": "6c2b63e1", "metadata": {}, "source": ["## \u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22"]}, {"attachments": {}, "cell_type": "markdown", "id": "3e777a7b", "metadata": {}, "source": ["\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0cvectordb\u5e76\u4e0d\u662f\u552f\u4e00\u4e00\u79cd\u68c0\u7d22\u6587\u6863\u7684\u5de5\u5177\u3002\n", "\n", "`LangChain`\u68c0\u7d22\u5668\u62bd\u8c61\u5305\u62ec\u5176\u4ed6\u68c0\u7d22\u6587\u6863\u7684\u65b9\u5f0f\uff0c\u5982\uff1a`TF-IDF` \u6216 `SVM`\u3002"]}, {"cell_type": "code", "execution_count": 61, "id": "83d2e808", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.retrievers import SVMRetriever\n", "from langchain.retrievers import TFIDFRetriever\n", "from langchain.document_loaders import PyPDFLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter"]}, {"cell_type": "code", "execution_count": 62, "id": "bcf5b760", "metadata": {"tags": []}, "outputs": [], "source": ["# \u52a0\u8f7dPDF\n", "loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n", "pages = loader.load()\n", "all_page_text = [p.page_content for p in pages]\n", "joined_page_text = \" \".join(all_page_text)\n", "\n", "# \u5206\u5272\u6587\u672c\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", "splits = text_splitter.split_text(joined_page_text)\n"]}, 
{"cell_type": "code", "execution_count": 77, "id": "1657e768", "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7dPDF\n", "loader_chinese = PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf\")\n", "pages_chinese = loader_chinese.load()\n", "all_page_text_chinese = [p.page_content for p in pages_chinese]\n", "joined_page_text_chinese = \" \".join(all_page_text_chinese)\n", "\n", "# \u5206\u5272\u6587\u672c\n", "text_splitter_chinese = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", "splits_chinese = text_splitter_chinese.split_text(joined_page_text_chinese)"]}, {"cell_type": "code", "execution_count": 64, "id": "59abbaff", "metadata": {}, "outputs": [], "source": ["# \u68c0\u7d22\n", "svm_retriever = SVMRetriever.from_texts(splits, embedding)\n", "tfidf_retriever = TFIDFRetriever.from_texts(splits)"]}, {"cell_type": "code", "execution_count": 66, "id": "7885389e", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"let me just check what questions you have righ t now. So if there are no questions, I'll just \\nclose with two reminders, which are after class today or as you start to talk with other \\npeople in this class, I just encourage you again to start to form project partners, to try to \\nfind project partners to do your project with. And also, this is a good time to start forming \\nstudy groups, so either talk to your friends or post in the newsgroup, but we just \\nencourage you to try to star t to do both of those today, okay? Form study groups, and try \\nto find two other project partners. \\nSo thank you. I'm looking forward to teaching this class, and I'll see you in a couple of \\ndays. 
[End of Audio] \\nDuration: 69 minutes\", metadata={})"]}, "execution_count": 66, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\" # \u8fd9\u95e8\u8bfe\u7684\u4e3b\u8981\u4e3b\u9898\u662f\u4ec0\u4e48\uff1f\n", "docs_svm = svm_retriever.get_relevant_documents(question)\n", "docs_svm[0]"]}, {"cell_type": "code", "execution_count": 67, "id": "2a1659c0", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \\npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \\ngroup the picture into regions. Let me actually blow that up so that you can see it more \\nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \\ngrouping the image into [inaudible] regions. \\nAnd what Ashutosh and Min did was they then applied the learning algorithm to say can \\nwe take this clustering and us e it to build a 3D model of the world? And so using the \\nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \\nworld looks like so that they could come up with a 3D model that you can sort of fly \\nthrough, okay? Although many people used to th ink it's not possible to take a single \\nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \\nalgorithm is the first step. They were able to. \\nI'll just show you one more example. I like this because it's a picture of Stanford with our \\nbeautiful Stanford campus. So again, taking th e same sort of clustering algorithms, taking \\nthe same sort of unsupervised learning algor ithm, you can group the pixels into different \\nregions. And using that as a pre-processing step, they eventually built this sort of 3D model of Stanford campus in a single picture. 
You can sort of walk into the ceiling, look\", metadata={})"]}, "execution_count": 67, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"what did they say about matlab?\" # \u4ed6\u4eec\u5173\u4e8eMatlab\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\n", "docs_tfidf = tfidf_retriever.get_relevant_documents(question)\n", "docs_tfidf[0]"]}, {"cell_type": "code", "execution_count": 78, "id": "5ade9c87", "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "svm_retriever_chinese = SVMRetriever.from_texts(splits_chinese, embedding)\n", "tfidf_retriever_chinese = TFIDFRetriever.from_texts(splits_chinese)"]}, {"cell_type": "code", "execution_count": 79, "id": "cc823bea", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='fig, ax = plt.subplots() \\n# step4 \u7ed8\u5236\u56fe\u50cf\uff0c \u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u2f06\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.plot(x, y, label=\\'linear\\') \\n# step5 \u6dfb\u52a0\u6807\u7b7e\uff0c\u2f42\u5b57\u548c\u56fe\u4f8b\uff0c\u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u56db\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n\u601d\u8003\u9898\\n\u8bf7\u601d\u8003\u4e24\u79cd\u7ed8\u56fe\u6a21\u5f0f\u7684\u4f18\u7f3a\u70b9\u548c\u5404\u2f83\u9002\u5408\u7684\u4f7f\u2f64\u573a\u666f\\n\u5728\u7b2c\u4e94\u8282\u7ed8\u56fe\u6a21\u677f\u4e2d\u6211\u4eec\u662f\u4ee5 OO \u6a21\u5f0f\u4f5c\u4e3a\u4f8b\u2f26\u5c55\u793a\u7684\uff0c\u8bf7\u601d\u8003\u5e76\u5199\u2f00\u4e2a pyplot \u7ed8\u56fe\u6a21\u5f0f\u7684\u7b80\u5355\u6a21\u677f', metadata={})"]}, "execution_count": 79, "metadata": {}, "output_type": "execute_result"}], "source": ["question_chinese = \"\u8fd9\u95e8\u8bfe\u7684\u4e3b\u8981\u4e3b\u9898\u662f\u4ec0\u4e48\uff1f\" \n", "docs_svm_chinese = svm_retriever_chinese.get_relevant_documents(question_chinese)\n", 
"docs_svm_chinese[0]"]}, {"cell_type": "code", "execution_count": 80, "id": "01eb9d43", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='fig, ax = plt.subplots() \\n# step4 \u7ed8\u5236\u56fe\u50cf\uff0c \u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u2f06\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.plot(x, y, label=\\'linear\\') \\n# step5 \u6dfb\u52a0\u6807\u7b7e\uff0c\u2f42\u5b57\u548c\u56fe\u4f8b\uff0c\u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u56db\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n\u601d\u8003\u9898\\n\u8bf7\u601d\u8003\u4e24\u79cd\u7ed8\u56fe\u6a21\u5f0f\u7684\u4f18\u7f3a\u70b9\u548c\u5404\u2f83\u9002\u5408\u7684\u4f7f\u2f64\u573a\u666f\\n\u5728\u7b2c\u4e94\u8282\u7ed8\u56fe\u6a21\u677f\u4e2d\u6211\u4eec\u662f\u4ee5 OO \u6a21\u5f0f\u4f5c\u4e3a\u4f8b\u2f26\u5c55\u793a\u7684\uff0c\u8bf7\u601d\u8003\u5e76\u5199\u2f00\u4e2a pyplot \u7ed8\u56fe\u6a21\u5f0f\u7684\u7b80\u5355\u6a21\u677f', metadata={})"]}, "execution_count": 80, "metadata": {}, "output_type": "execute_result"}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "docs_tfidf_chinese = tfidf_retriever_chinese.get_relevant_documents(question_chinese)\n", "docs_tfidf_chinese[0]"]}], "metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}}, "nbformat": 4, "nbformat_minor": 5} \ No newline at end of file diff --git a/content/LangChain Chat with Your Data/6.问答 Question Answering.ipynb b/content/LangChain Chat with Your Data/6.问答 Question Answering.ipynb index 6d35fb2..92b6812 100644 --- a/content/LangChain Chat with Your Data/6.问答 
Question Answering.ipynb +++ b/content/LangChain Chat with Your Data/6.问答 Question Answering.ipynb @@ -1 +1 @@ -{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# \u7b2c\u516d\u7ae0 \u95ee\u7b54", "\n", " - [\u4e00\u3001\u5f15\u8a00](#\u4e00\u3001\u5f15\u8a00)\n", " - [\u4e8c\u3001\u73af\u5883\u914d\u7f6e](#\u4e8c\u3001\u73af\u5883\u914d\u7f6e)\n", " - [\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93](#\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93)\n", " - [\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u8fde](#\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u8fde)\n", " - [\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.1-\u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.2-\u57fa\u4e8e-MapReduce-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.3-\u57fa\u4e8e-Refine-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55](#\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55)\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e00\u3001\u5f15\u8a00\n", "\n", "\n", 
"\u5728\u4e0a\u4e00\u7ae0\uff0c\u6211\u4eec\u5df2\u7ecf\u8ba8\u8bba\u4e86\u5982\u4f55\u68c0\u7d22\u4e0e\u7ed9\u5b9a\u95ee\u9898\u76f8\u5173\u7684\u6587\u6863\u3002\u4e0b\u4e00\u6b65\u662f\u83b7\u53d6\u8fd9\u4e9b\u6587\u6863\uff0c\u62ff\u5230\u539f\u59cb\u95ee\u9898\uff0c\u5c06\u5b83\u4eec\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u8981\u6c42\u5b83\u56de\u7b54\u8fd9\u4e2a\u95ee\u9898\u3002\u5728\u672c\u8bfe\u7a0b\u4e2d\uff0c\u6211\u4eec\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u8fd9\u4e00\u8fc7\u7a0b\uff0c\u4ee5\u53ca\u5b8c\u6210\u8fd9\u9879\u4efb\u52a1\u7684\u51e0\u79cd\u4e0d\u540c\u65b9\u6cd5\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u5df2\u7ecf\u5b8c\u6210\u4e86\u6574\u4e2a\u5b58\u50a8\u548c\u83b7\u53d6\uff0c\u83b7\u53d6\u4e86\u76f8\u5173\u7684\u5207\u5206\u6587\u6863\u4e4b\u540e\uff0c\u73b0\u5728\u6211\u4eec\u9700\u8981\u5c06\u5b83\u4eec\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u4ee5\u83b7\u5f97\u7b54\u6848\u3002\u8fd9\u4e2a\u8fc7\u7a0b\u7684\u4e00\u822c\u6d41\u7a0b\u5982\u4e0b\uff1a\u9996\u5148\u95ee\u9898\u88ab\u63d0\u51fa\uff0c\u7136\u540e\u6211\u4eec\u67e5\u627e\u76f8\u5173\u7684\u6587\u6863\uff0c\u63a5\u7740\u5c06\u8fd9\u4e9b\u5207\u5206\u6587\u6863\u548c\u7cfb\u7edf\u63d0\u793a\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u83b7\u5f97\u7b54\u6848\u3002\n", "\n", 
"\u9ed8\u8ba4\u60c5\u51b5\u4e0b\uff0c\u6211\u4eec\u5c06\u6240\u6709\u7684\u6587\u6863\u5207\u7247\u90fd\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\uff0c\u5373\u540c\u4e00\u6b21\u8bed\u8a00\u6a21\u578b\u8c03\u7528\u4e2d\u3002\u7136\u800c\uff0c\u6709\u4e00\u4e9b\u4e0d\u540c\u7684\u65b9\u6cd5\u53ef\u4ee5\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u5b83\u4eec\u90fd\u6709\u4f18\u7f3a\u70b9\u3002\u5927\u90e8\u5206\u4f18\u70b9\u6765\u81ea\u4e8e\u6709\u65f6\u53ef\u80fd\u4f1a\u6709\u5f88\u591a\u6587\u6863\uff0c\u4f46\u4f60\u7b80\u5355\u5730\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002MapReduce\u3001Refine \u548c MapRerank \u662f\u4e09\u79cd\u65b9\u6cd5\uff0c\u7528\u4e8e\u89e3\u51b3\u8fd9\u4e2a\u77ed\u4e0a\u4e0b\u6587\u7a97\u53e3\u7684\u95ee\u9898\u3002\u6211\u4eec\u5c06\u5728\u8be5\u8bfe\u7a0b\u4e2d\u8fdb\u884c\u7b80\u8981\u4ecb\u7ecd\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e8c\u3001\u73af\u5883\u914d\u7f6e"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u914d\u7f6e\u73af\u5883\u65b9\u6cd5\u540c\u524d\uff0c\u6b64\u5904\u4e0d\u518d\u8d58\u8ff0"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57282023\u5e749\u67082\u65e5\u4e4b\u540e\uff0cGPT-3.5 API \u4f1a\u8fdb\u884c\u66f4\u65b0\uff0c\u56e0\u6b64\u6b64\u5904\u9700\u8981\u8fdb\u884c\u4e00\u4e2a\u65f6\u95f4\u5224\u65ad"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["gpt-3.5-turbo-0301\n"]}], "source": ["import datetime\n", "current_date = 
datetime.datetime.now().date()\n", "if current_date < datetime.date(2023, 9, 2):\n", " llm_name = \"gpt-3.5-turbo-0301\"\n", "else:\n", " llm_name = \"gpt-3.5-turbo\"\n", "print(llm_name)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7d\u5728\u4e4b\u524d\u5df2\u7ecf\u8fdb\u884c\u6301\u4e45\u5316\u7684\u5411\u91cf\u6570\u636e\u5e93\n", "from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma'\n", "embedding = OpenAIEmbeddings()\n", "vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["# \u53ef\u4ee5\u770b\u89c1\u5305\u542b\u4e86\u6211\u4eec\u4e4b\u524d\u8fdb\u884c\u5206\u5272\u7684209\u4e2a\u6587\u6863\n", "print(vectordb._collection.count())"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:07<00:00, 7.17s/it]\n"]}], "source": ["'''\u7531\u4e8e\u76ee\u524d\u7f3a\u4e4f\u7b2c\u4e94\u7ae0\u68c0\u7d22\uff0c\u6682\u65f6\u521d\u59cb\u5316\u4e00\u4e2a\u5411\u91cf\u6570\u636e\u5e93\u4ee3\u66ff'''\n", "from langchain.document_loaders import PyPDFLoader\n", "\n", "# \u52a0\u8f7d PDF\n", "loaders = [\n", " # \u6545\u610f\u6dfb\u52a0\u91cd\u590d\u6587\u6863\uff0c\u4f7f\u6570\u636e\u6df7\u4e71\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"),\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"),\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture02.pdf\"),\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\")\n", 
"]\n", "docs = []\n", "for loader in loaders:\n", " docs.extend(loader.load())\n", "\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size = 1500, # \u6bcf\u4e2a\u6587\u672c\u5757\u7684\u5927\u5c0f\u3002\u8fd9\u610f\u5473\u7740\u6bcf\u6b21\u5207\u5206\u6587\u672c\u65f6\uff0c\u4f1a\u5c3d\u91cf\u4f7f\u6bcf\u4e2a\u5757\u5305\u542b 1500 \u4e2a\u5b57\u7b26\u3002\n", " chunk_overlap = 150 # \u6bcf\u4e2a\u6587\u672c\u5757\u4e4b\u95f4\u7684\u91cd\u53e0\u90e8\u5206\u3002\n", ")\n", "\n", "splits = text_splitter.split_documents(docs)\n", "\n", "\n", "vectordb = Chroma.from_documents(\n", " documents=splits,\n", " embedding=embedding,\n", " persist_directory=persist_directory # \u5141\u8bb8\u6211\u4eec\u5c06persist_directory\u76ee\u5f55\u4fdd\u5b58\u5230\u78c1\u76d8\u4e0a\n", ")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u6d4b\u8bd5\u4e00\u4e0b\u5bf9\u4e8e\u4e00\u4e2a\u63d0\u95ee\u8fdb\u884c\u5411\u91cf\u68c0\u7d22\u3002\u5982\u4e0b\u4ee3\u7801\u4f1a\u5728\u5411\u91cf\u6570\u636e\u5e93\u4e2d\u6839\u636e\u76f8\u4f3c\u6027\u8fdb\u884c\u68c0\u7d22\uff0c\u8fd4\u56de\u7ed9\u4f60 k \u4e2a\u6587\u6863\u3002"]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 35, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## 
\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u8fde"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u4e8e LangChain\uff0c\u6211\u4eec\u53ef\u4ee5\u6784\u9020\u4e00\u4e2a\u4f7f\u7528 GPT3.5 \u8fdb\u884c\u95ee\u7b54\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\uff0c\u8fd9\u662f\u4e00\u79cd\u901a\u8fc7\u68c0\u7d22\u6b65\u9aa4\u8fdb\u884c\u95ee\u7b54\u7684\u65b9\u6cd5\u3002\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u4f20\u5165\u4e00\u4e2a\u8bed\u8a00\u6a21\u578b\u548c\u4e00\u4e2a\u5411\u91cf\u6570\u636e\u5e93\u6765\u521b\u5efa\u5b83\u4f5c\u4e3a\u68c0\u7d22\u5668\u3002\u7136\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u7528\u95ee\u9898\u4f5c\u4e3a\u67e5\u8be2\u8c03\u7528\u5b83\uff0c\u5f97\u5230\u4e00\u4e2a\u7b54\u6848\u3002"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["# \u4f7f\u7528 ChatGPT3.5\uff0c\u6e29\u5ea6\u8bbe\u7f6e\u4e3a0\n", "from langchain.chat_models import ChatOpenAI\n", "llm = ChatOpenAI(model_name=llm_name, temperature=0)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["# \u5bfc\u5165\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "from langchain.chains import RetrievalQA"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["# \u58f0\u660e\u4e00\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"What are major topics for this class?\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The major topic for this class is machine learning. 
Additionally, there may be some discussion on statistics and algebra as a refresher, and later in the quarter, there may be some discussion on extensions for the material covered in the main lectures.'"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u4ece\u8fd9\u4e9b\u4e0a\u4e0b\u6587\u6765\u770b\uff0c\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u5305\u62ec\u8bfe\u7a0b\u4fe1\u606f\u3001\u5728\u7ebf\u8d44\u6e90\u548c\u7ebf\u6027\u4ee3\u6570\u3002\n"]}], "source": ["print(result[\"result\"])"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u901a\u8fc7\u4e0a\u8ff0\u4ee3\u7801\uff0c\u6211\u4eec\u53ef\u4ee5\u5b9e\u73b0\u4e00\u4e2a\u7b80\u5355\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u3002\u63a5\u4e0b\u6765\uff0c\u8ba9\u6211\u4eec\u6df1\u5165\u5176\u4e2d\u7684\u7ec6\u8282\uff0c\u770b\u770b\u5728\u8fd9\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u4e2d\uff0cLangChain \u90fd\u505a\u4e86\u4e9b\u4ec0\u4e48\u3002\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\n", 
"\u6211\u4eec\u9996\u5148\u5b9a\u4e49\u4e86\u4e00\u4e2a\u63d0\u793a\u6a21\u677f\u3002\u5b83\u5305\u542b\u4e00\u4e9b\u5173\u4e8e\u5982\u4f55\u4f7f\u7528\u4e0b\u9762\u7684\u4e0a\u4e0b\u6587\u7247\u6bb5\u7684\u8bf4\u660e\uff0c\u7136\u540e\u6709\u4e00\u4e2a\u4e0a\u4e0b\u6587\u53d8\u91cf\u7684\u5360\u4f4d\u7b26\u3002"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. \n", "{context}\n", "Question: {question}\n", "Helpful Answer:\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"\u4f7f\u7528\u4ee5\u4e0b\u4e0a\u4e0b\u6587\u7247\u6bb5\u6765\u56de\u7b54\u6700\u540e\u7684\u95ee\u9898\u3002\u5982\u679c\u4f60\u4e0d\u77e5\u9053\u7b54\u6848\uff0c\u53ea\u9700\u8bf4\u4e0d\u77e5\u9053\uff0c\u4e0d\u8981\u8bd5\u56fe\u7f16\u9020\u7b54\u6848\u3002\u7b54\u6848\u6700\u591a\u4f7f\u7528\u4e09\u4e2a\u53e5\u5b50\u3002\u5c3d\u91cf\u7b80\u660e\u627c\u8981\u5730\u56de\u7b54\u3002\u5728\u56de\u7b54\u7684\u6700\u540e\u4e00\u5b9a\u8981\u8bf4\"\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01\"\n", "{context}\n", "\u95ee\u9898\uff1a{question}\n", "\u6709\u7528\u7684\u56de\u7b54\uff1a\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": ["# Run chain\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " 
return_source_documents=True,\n", " chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n", ")"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": ["question = \"Is probability a class topic?\""]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a class topic and the instructor assumes familiarity with basic probability and statistics.'"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "question = \"\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\""]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u3002\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01'"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"So in this class, we've tried to convey to you a broad set of principl es and tools that will \\nbe useful for doing many, many things. 
And ev ery time I teach this class, I can actually \\nvery confidently say that af ter December, no matter what yo u're going to do after this \\nDecember when you've sort of completed this class, you'll find the things you learn in \\nthis class very useful, and these things will be useful pretty much no matter what you end \\nup doing later in your life. \\nSo I have more logistics to go over later, but let's say a few more words about machine \\nlearning. I feel that machine learning grew out of early work in AI, early work in artificial \\nintelligence. And over the last \u2014 I wanna say last 15 or last 20 years or so, it's been viewed as a sort of growing new capability for computers. And in particular, it turns out \\nthat there are many programs or there are many applications that you can't program by \\nhand. \\nFor example, if you want to get a computer to read handwritten characters, to read sort of \\nhandwritten digits, that actual ly turns out to be amazingly difficult to write a piece of \\nsoftware to take this input, an image of some thing that I wrote and to figure out just what \\nit is, to translate my cursive handwriting into \u2014 to extract the characters I wrote out in \\nlonghand. And other things: One thing that my students and I do is autonomous flight. 
It \\nturns out to be extremely difficult to sit dow n and write a program to fly a helicopter.\", metadata={'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 2})"]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"source_documents\"][0]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8fd9\u79cd\u65b9\u6cd5\u975e\u5e38\u597d\uff0c\u56e0\u4e3a\u5b83\u53ea\u6d89\u53ca\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u3002\u7136\u800c\uff0c\u5b83\u4e5f\u6709\u5c40\u9650\u6027\uff0c\u5373\u5982\u679c\u6587\u6863\u592a\u591a\uff0c\u53ef\u80fd\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u9002\u914d\u5230\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528\u53e6\u4e00\u79cd\u6280\u672f\u6765\u5bf9\u6587\u6863\u8fdb\u884c\u95ee\u7b54\uff0c\u5373MapReduce\u6280\u672f\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u5728 MapReduce \u6280\u672f\u4e2d\uff0c\u9996\u5148\u5c06\u6bcf\u4e2a\u72ec\u7acb\u7684\u6587\u6863\u5355\u72ec\u53d1\u9001\u5230\u8bed\u8a00\u6a21\u578b\u4ee5\u83b7\u53d6\u539f\u59cb\u7b54\u6848\u3002\u7136\u540e\uff0c\u8fd9\u4e9b\u7b54\u6848\u901a\u8fc7\u6700\u7ec8\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u7ec4\u5408\u6210\u6700\u7ec8\u7684\u7b54\u6848\u3002\u867d\u7136\u8fd9\u6837\u6d89\u53ca\u4e86\u66f4\u591a\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\uff0c\u4f46\u5b83\u7684\u4f18\u52bf\u5728\u4e8e\u53ef\u4ee5\u5904\u7406\u4efb\u610f\u6570\u91cf\u7684\u6587\u6863\u3002\n"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], 
"source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["'It is not clear from the given portion of the document whether probability is a class topic or not. The text only mentions that familiarity with basic probability and statistics is assumed as a prerequisite for the class.'"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u6839\u636e\u7ed9\u51fa\u7684\u6587\u4ef6\u90e8\u5206\uff0c\u6ca1\u6709\u63d0\u5230\u6982\u7387\u8bba\u3002'"]}, "execution_count": 55, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")\n", "# \u4e2d\u6587\u7248\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": 
["\u5f53\u6211\u4eec\u5c06\u4e4b\u524d\u7684\u95ee\u9898\u901a\u8fc7\u8fd9\u4e2a\u94fe\u8fdb\u884c\u8fd0\u884c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u8fd9\u79cd\u65b9\u6cd5\u7684\u4e24\u4e2a\u95ee\u9898\u3002\u7b2c\u4e00\uff0c\u901f\u5ea6\u8981\u6162\u5f97\u591a\u3002\u7b2c\u4e8c\uff0c\u7ed3\u679c\u5b9e\u9645\u4e0a\u66f4\u5dee\u3002\u6839\u636e\u7ed9\u5b9a\u6587\u6863\u7684\u8fd9\u4e00\u90e8\u5206\uff0c\u5bf9\u8fd9\u4e2a\u95ee\u9898\u5e76\u6ca1\u6709\u660e\u786e\u7684\u7b54\u6848\u3002\u8fd9\u53ef\u80fd\u662f\u56e0\u4e3a\u5b83\u662f\u57fa\u4e8e\u6bcf\u4e2a\u6587\u6863\u5355\u72ec\u56de\u7b54\u7684\u3002\u56e0\u6b64\uff0c\u5982\u679c\u4fe1\u606f\u5206\u5e03\u5728\u4e24\u4e2a\u6587\u6863\u4e4b\u95f4\uff0c\u5b83\u5e76\u6ca1\u6709\u5728\u540c\u4e00\u4e0a\u4e0b\u6587\u4e2d\u83b7\u53d6\u5230\u6240\u6709\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["#import os\n", "#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n", "#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\" # replace dots with your api key"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u5bfc\u5165\u4e0a\u8ff0\u73af\u5883\u53d8\u91cf\uff0c\u7136\u540e\u63a2\u5bfb MapReduce \u6587\u6863\u94fe\u7684\u7ec6\u8282\u3002\u4f8b\u5982\uff0c\u4e0a\u8ff0\u6f14\u793a\u4e2d\uff0c\u6211\u4eec\u5b9e\u9645\u4e0a\u6d89\u53ca\u4e86\u56db\u4e2a\u5355\u72ec\u7684\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\u3002\u5728\u8fd0\u884c\u5b8c\u6bcf\u4e2a\u6587\u6863\u540e\uff0c\u5b83\u4eec\u4f1a\u5728\u6700\u7ec8\u94fe\u5f0f\u4e2d\u7ec4\u5408\u5728\u4e00\u8d77\uff0c\u5373Stuffed Documents\u94fe\uff0c\u5c06\u6240\u6709\u8fd9\u4e9b\u56de\u7b54\u5408\u5e76\u5230\u6700\u7ec8\u7684\u8c03\u7528\u4e2d\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", 
"metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u7c7b\u4f3c\u5730\u8bbe\u7f6e\u94fe\u5f0f\u7c7b\u578b\u4e3aRefine\u3002\u8fd9\u662f\u4e00\u79cd\u65b0\u7684\u94fe\u5f0f\u7c7b\u578b\u3002Refine \u6587\u6863\u94fe\u7c7b\u4f3c\u4e8e MapReduce \u94fe\uff0c\u5bf9\u4e8e\u6bcf\u4e00\u4e2a\u6587\u6863\uff0c\u4f1a\u8c03\u7528\u4e00\u6b21 LLM\uff0c\u4f46\u6709\u6240\u6539\u8fdb\u7684\u662f\uff0c\u6211\u4eec\u6bcf\u6b21\u53d1\u9001\u7ed9 LLM \u7684\u6700\u7ec8\u63d0\u793a\u662f\u4e00\u4e2a\u5e8f\u5217\uff0c\u8fd9\u4e2a\u5e8f\u5217\u4f1a\u5c06\u5148\u524d\u7684\u54cd\u5e94\u4e0e\u65b0\u6570\u636e\u7ed3\u5408\u5728\u4e00\u8d77\uff0c\u5e76\u8bf7\u6c42\u5f97\u5230\u6539\u8fdb\u540e\u7684\u54cd\u5e94\u3002\u56e0\u6b64\uff0c\u8fd9\u662f\u4e00\u79cd\u7c7b\u4f3c\u4e8e RNN \u7684\u6982\u5ff5\uff0c\u6211\u4eec\u589e\u5f3a\u4e86\u4e0a\u4e0b\u6587\uff0c\u4ece\u800c\u89e3\u51b3\u4fe1\u606f\u5206\u5e03\u5728\u4e0d\u540c\u6587\u6863\u7684\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional context provided, probability is assumed to be a prerequisite and not a main topic of the class. The instructor assumes that students are familiar with basic probability and statistics, including random variables, expectation, variance, and basic linear algebra. The class will not be very programming-intensive, but some programming will be done in MATLAB or Octave. The instructor will provide a refresher course on the prerequisites in some of the discussion sections. The class also assumes familiarity with basic linear algebra, including matrices, vectors, matrix multiplication, and matrix inverse. Most undergraduate linear algebra courses, such as Math 51, 103, Math 113, or CS205 at Stanford, are more than enough. 
The instructor will also review the prerequisites in some of the discussion sections.'"]}, "execution_count": 56, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional context provided, the instructor mentions that they will cover statistics and algebra in the discussion sections as a refresher, and will also use the discussion sections to go over extensions of the material taught in the main lectures. However, there is no explicit mention of probability theory being covered in the course. Therefore, the original answer still stands.'"]}, "execution_count": 57, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u4f60\u4f1a\u6ce8\u610f\u5230\uff0c\u8fd9\u4e2a\u7ed3\u679c\u6bd4MapReduce\u94fe\u7684\u7ed3\u679c\u8981\u597d\u3002\u8fd9\u662f\u56e0\u4e3a\u4f7f\u7528Refined Chain\u5141\u8bb8\u4f60\u9010\u4e2a\u5730\u7ec4\u5408\u4fe1\u606f\uff0c\u5b9e\u9645\u4e0a\u6bd4MapReduce\u94fe\u9f13\u52b1\u66f4\u591a\u7684\u4fe1\u606f\u4f20\u9012\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u5728\u8fd9\u91cc\u505a\u4e00\u4e2a\u5b9e\u9a8c\u3002\n", "\n", 
"\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2aQA\u94fe\uff0c\u4f7f\u7528\u9ed8\u8ba4\u7684stuff\u3002\u8ba9\u6211\u4eec\u95ee\u4e00\u4e2a\u95ee\u9898\uff0c\u6982\u7387\u8bba\u662f\u8bfe\u7a0b\u7684\u4e3b\u9898\u5417\uff1f\u5b83\u4f1a\u56de\u7b54\uff0c\u6982\u7387\u8bba\u5e94\u8be5\u662f\u5148\u51b3\u6761\u4ef6\u3002\u6211\u4eec\u5c06\u8ffd\u95ee\uff0c\u4e3a\u4ec0\u4e48\u9700\u8981\u8fd9\u4e9b\u5148\u51b3\u6761\u4ef6\uff1f\u7136\u540e\u6211\u4eec\u5f97\u5230\u4e86\u4e00\u4e2a\u7b54\u6848\u3002\u8fd9\u95e8\u8bfe\u7684\u5148\u51b3\u6761\u4ef6\u662f\u5047\u5b9a\u5177\u6709\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u8fd9\u4e0e\u4e4b\u524d\u95ee\u6709\u5173\u6982\u7387\u7684\u95ee\u9898\u6beb\u4e0d\u76f8\u5173\u3002"]}, {"cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": ["qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a topic in this class. The speaker assumes that students have familiarity with basic probability and statistics, and mentions that most undergraduate statistics classes will be more than enough preparation for this class.'"]}, "execution_count": 59, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The prerequisites are needed because in this class, the instructor assumes that all students have a basic knowledge of computer science and knowledge of basic computer skills and principles. This includes understanding of big-O notation and other fundamental concepts. 
Without this basic knowledge, it may be difficult to understand the material covered in the class.'"]}, "execution_count": 60, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"why are those prerequesites needed?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u4f5c\u8005\u5728\u6587\u4e2d\u63d0\u5230\u4e86\u8fd9\u95e8\u8bfe\u7a0b\u9700\u8981\u5b66\u751f\u5177\u5907\u57fa\u672c\u7684\u6982\u7387\u8bba\u548c\u7edf\u8ba1\u5b66\u77e5\u8bc6\u3002'"]}, "execution_count": 62, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u6982\u7387\u8bba\u662f\u8fd9\u8282\u8bfe\u7684\u4e00\u4e2a\u5185\u5bb9\u5417\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u5728\u8fd9\u6bb5\u4e0a\u4e0b\u6587\u4e2d\uff0c\u4f5c\u8005\u63d0\u5230\u8fd9\u4e9b\u77e5\u8bc6\u662f\u8fd9\u95e8\u8bfe\u7a0b\u7684\u5148\u51b3\u6761\u4ef6\uff0c\u56e0\u4e3a\u8fd9\u95e8\u8bfe\u7a0b\u6d89\u53ca\u5230\u673a\u5668\u5b66\u4e60\u7684\u57fa\u672c\u6982\u5ff5\u548c\u7b97\u6cd5\uff0c\u9700\u8981\u5b66\u751f\u5177\u5907\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u5982\u679c\u5b66\u751f\u6ca1\u6709\u8fd9\u4e9b\u57fa\u7840\u77e5\u8bc6\uff0c\u53ef\u80fd\u4f1a\u5f88\u96be\u7406\u89e3\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u7b97\u6cd5\u3002\u56e0\u6b64\uff0c\u5b66\u751f\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\u624d\u80fd\u66f4\u597d\u5730\u5b66\u4e60\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u3002'"]}, "execution_count": 63, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u4e3a\u4ec0\u4e48\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\"\n", "result = qa_chain({\"query\": 
question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u672c\u4e0a\uff0c\u6211\u4eec\u4f7f\u7528\u7684\u94fe\u5f0f\uff08chain\uff09\u6ca1\u6709\u4efb\u4f55\u72b6\u6001\u7684\u6982\u5ff5\u3002\u5b83\u4e0d\u8bb0\u5f97\u4e4b\u524d\u7684\u95ee\u9898\u6216\u4e4b\u524d\u7684\u7b54\u6848\u3002\u4e3a\u4e86\u5b9e\u73b0\u8fd9\u4e00\u70b9\uff0c\u6211\u4eec\u9700\u8981\u5f15\u5165\u5185\u5b58\uff0c\u8fd9\u662f\u6211\u4eec\u5c06\u5728\u4e0b\u4e00\u8282\u4e2d\u8ba8\u8bba\u7684\u5185\u5bb9\u3002"]}], "metadata": {"kernelspec": {"display_name": "gpt", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}, "orig_nbformat": 4}, "nbformat": 4, "nbformat_minor": 2} \ No newline at end of file +{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# \u7b2c\u516d\u7ae0 \u95ee\u7b54\n", "\n", " - [\u4e00\u3001\u5f15\u8a00](#\u4e00\u3001\u5f15\u8a00)\n", " - [\u4e8c\u3001\u73af\u5883\u914d\u7f6e](#\u4e8c\u3001\u73af\u5883\u914d\u7f6e)\n", " - [\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93](#\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93)\n", " - [\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.1-\u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.2-\u57fa\u4e8e-MapReduce-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.3 \u57fa\u4e8e Refine 
\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.3-\u57fa\u4e8e-Refine-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55](#\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55)\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e00\u3001\u5f15\u8a00\n", "\n", "\n", "\u5728\u4e0a\u4e00\u7ae0\uff0c\u6211\u4eec\u5df2\u7ecf\u8ba8\u8bba\u4e86\u5982\u4f55\u68c0\u7d22\u4e0e\u7ed9\u5b9a\u95ee\u9898\u76f8\u5173\u7684\u6587\u6863\u3002\u4e0b\u4e00\u6b65\u662f\u83b7\u53d6\u8fd9\u4e9b\u6587\u6863\uff0c\u62ff\u5230\u539f\u59cb\u95ee\u9898\uff0c\u5c06\u5b83\u4eec\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u8981\u6c42\u5b83\u56de\u7b54\u8fd9\u4e2a\u95ee\u9898\u3002\u5728\u672c\u8bfe\u7a0b\u4e2d\uff0c\u6211\u4eec\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u8fd9\u4e00\u8fc7\u7a0b\uff0c\u4ee5\u53ca\u5b8c\u6210\u8fd9\u9879\u4efb\u52a1\u7684\u51e0\u79cd\u4e0d\u540c\u65b9\u6cd5\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u5df2\u7ecf\u5b8c\u6210\u4e86\u6574\u4e2a\u5b58\u50a8\u548c\u83b7\u53d6\uff0c\u83b7\u53d6\u4e86\u76f8\u5173\u7684\u5207\u5206\u6587\u6863\u4e4b\u540e\uff0c\u73b0\u5728\u6211\u4eec\u9700\u8981\u5c06\u5b83\u4eec\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u4ee5\u83b7\u5f97\u7b54\u6848\u3002\u8fd9\u4e2a\u8fc7\u7a0b\u7684\u4e00\u822c\u6d41\u7a0b\u5982\u4e0b\uff1a\u9996\u5148\u95ee\u9898\u88ab\u63d0\u51fa\uff0c\u7136\u540e\u6211\u4eec\u67e5\u627e\u76f8\u5173\u7684\u6587\u6863\uff0c\u63a5\u7740\u5c06\u8fd9\u4e9b\u5207\u5206\u6587\u6863\u548c\u7cfb\u7edf\u63d0\u793a\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u83b7\u5f97\u7b54\u6848\u3002\n", "\n", 
"\u9ed8\u8ba4\u60c5\u51b5\u4e0b\uff0c\u6211\u4eec\u5c06\u6240\u6709\u7684\u6587\u6863\u5207\u7247\u90fd\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\uff0c\u5373\u540c\u4e00\u6b21\u8bed\u8a00\u6a21\u578b\u8c03\u7528\u4e2d\u3002\u7136\u800c\uff0c\u6709\u4e00\u4e9b\u4e0d\u540c\u7684\u65b9\u6cd5\u53ef\u4ee5\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u5b83\u4eec\u90fd\u6709\u4f18\u7f3a\u70b9\u3002\u5927\u90e8\u5206\u4f18\u70b9\u6765\u81ea\u4e8e\u6709\u65f6\u53ef\u80fd\u4f1a\u6709\u5f88\u591a\u6587\u6863\uff0c\u4f46\u4f60\u6839\u672c\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002MapReduce\u3001Refine \u548c MapRerank \u662f\u4e09\u79cd\u65b9\u6cd5\uff0c\u7528\u4e8e\u89e3\u51b3\u8fd9\u4e2a\u77ed\u4e0a\u4e0b\u6587\u7a97\u53e3\u7684\u95ee\u9898\u3002\u6211\u4eec\u5c06\u5728\u8be5\u8bfe\u7a0b\u4e2d\u8fdb\u884c\u7b80\u8981\u4ecb\u7ecd\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e8c\u3001\u73af\u5883\u914d\u7f6e"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u914d\u7f6e\u73af\u5883\u65b9\u6cd5\u540c\u524d\uff0c\u6b64\u5904\u4e0d\u518d\u8d58\u8ff0"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57282023\u5e749\u67082\u65e5\u4e4b\u540e\uff0cGPT-3.5 API \u4f1a\u8fdb\u884c\u66f4\u65b0\uff0c\u56e0\u6b64\u6b64\u5904\u9700\u8981\u8fdb\u884c\u4e00\u4e2a\u65f6\u95f4\u5224\u65ad"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["gpt-3.5-turbo-0301\n"]}], "source": ["import datetime\n", "current_date = 
datetime.datetime.now().date()\n", "if current_date < datetime.date(2023, 9, 2):\n", " llm_name = \"gpt-3.5-turbo-0301\"\n", "else:\n", " llm_name = \"gpt-3.5-turbo\"\n", "print(llm_name)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7d\u5728\u4e4b\u524d\u5df2\u7ecf\u8fdb\u884c\u6301\u4e45\u5316\u7684\u5411\u91cf\u6570\u636e\u5e93\n", "from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma/cs229_lectures/'\n", "embedding = OpenAIEmbeddings()\n", "vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["# \u53ef\u4ee5\u770b\u89c1\u5305\u542b\u4e86\u6211\u4eec\u4e4b\u524d\u8fdb\u884c\u5206\u5272\u7684209\u4e2a\u6587\u6863\n", "print(vectordb._collection.count())"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u6d4b\u8bd5\u4e00\u4e0b\u5bf9\u4e8e\u4e00\u4e2a\u63d0\u95ee\u8fdb\u884c\u5411\u91cf\u68c0\u7d22\u3002\u5982\u4e0b\u4ee3\u7801\u4f1a\u5728\u5411\u91cf\u6570\u636e\u5e93\u4e2d\u6839\u636e\u76f8\u4f3c\u6027\u8fdb\u884c\u68c0\u7d22\uff0c\u8fd4\u56de\u7ed9\u4f60 k \u4e2a\u6587\u6863\u3002"]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 35, "metadata": {}, "output_type": "execute_result"}], "source": ["question = 
\"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u4e8e LangChain\uff0c\u6211\u4eec\u53ef\u4ee5\u6784\u9020\u4e00\u4e2a\u4f7f\u7528 GPT3.5 \u8fdb\u884c\u95ee\u7b54\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\uff0c\u8fd9\u662f\u4e00\u79cd\u901a\u8fc7\u68c0\u7d22\u6b65\u9aa4\u8fdb\u884c\u95ee\u7b54\u7684\u65b9\u6cd5\u3002\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u4f20\u5165\u4e00\u4e2a\u8bed\u8a00\u6a21\u578b\u548c\u4e00\u4e2a\u5411\u91cf\u6570\u636e\u5e93\u6765\u521b\u5efa\u5b83\u4f5c\u4e3a\u68c0\u7d22\u5668\u3002\u7136\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u7528\u95ee\u9898\u4f5c\u4e3a\u67e5\u8be2\u8c03\u7528\u5b83\uff0c\u5f97\u5230\u4e00\u4e2a\u7b54\u6848\u3002"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["# \u4f7f\u7528 ChatGPT3.5\uff0c\u6e29\u5ea6\u8bbe\u7f6e\u4e3a0\n", "from langchain.chat_models import ChatOpenAI\n", "llm = ChatOpenAI(model_name=llm_name, temperature=0)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["# \u5bfc\u5165\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "from langchain.chains import RetrievalQA"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["# \u58f0\u660e\u4e00\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"What are major topics for this class?\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": 
[{"data": {"text/plain": ["'The major topic for this class is machine learning. Additionally, there may be some discussion on statistics and algebra as a refresher, and later in the quarter, there may be some discussion on extensions for the material covered in the main lectures.'"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u4ece\u8fd9\u4e9b\u4e0a\u4e0b\u6587\u6765\u770b\uff0c\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u5305\u62ec\u8bfe\u7a0b\u4fe1\u606f\u3001\u5728\u7ebf\u8d44\u6e90\u548c\u7ebf\u6027\u4ee3\u6570\u3002\n"]}], "source": ["print(result[\"result\"])"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u901a\u8fc7\u4e0a\u8ff0\u4ee3\u7801\uff0c\u6211\u4eec\u53ef\u4ee5\u5b9e\u73b0\u4e00\u4e2a\u7b80\u5355\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u3002\u63a5\u4e0b\u6765\uff0c\u8ba9\u6211\u4eec\u6df1\u5165\u5176\u4e2d\u7684\u7ec6\u8282\uff0c\u770b\u770b\u5728\u8fd9\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u4e2d\uff0cLangChain \u90fd\u505a\u4e86\u4e9b\u4ec0\u4e48\u3002\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\n", 
"\u6211\u4eec\u9996\u5148\u5b9a\u4e49\u4e86\u4e00\u4e2a\u63d0\u793a\u6a21\u677f\u3002\u5b83\u5305\u542b\u4e00\u4e9b\u5173\u4e8e\u5982\u4f55\u4f7f\u7528\u4e0b\u9762\u7684\u4e0a\u4e0b\u6587\u7247\u6bb5\u7684\u8bf4\u660e\uff0c\u7136\u540e\u6709\u4e00\u4e2a\u4e0a\u4e0b\u6587\u53d8\u91cf\u7684\u5360\u4f4d\u7b26\u3002"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. \n", "{context}\n", "Question: {question}\n", "Helpful Answer:\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"\u4f7f\u7528\u4ee5\u4e0b\u4e0a\u4e0b\u6587\u7247\u6bb5\u6765\u56de\u7b54\u6700\u540e\u7684\u95ee\u9898\u3002\u5982\u679c\u4f60\u4e0d\u77e5\u9053\u7b54\u6848\uff0c\u53ea\u9700\u8bf4\u4e0d\u77e5\u9053\uff0c\u4e0d\u8981\u8bd5\u56fe\u7f16\u9020\u7b54\u6848\u3002\u7b54\u6848\u6700\u591a\u4f7f\u7528\u4e09\u4e2a\u53e5\u5b50\u3002\u5c3d\u91cf\u7b80\u660e\u627c\u8981\u5730\u56de\u7b54\u3002\u5728\u56de\u7b54\u7684\u6700\u540e\u4e00\u5b9a\u8981\u8bf4\"\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01\"\n", "{context}\n", "\u95ee\u9898\uff1a{question}\n", "\u6709\u7528\u7684\u56de\u7b54\uff1a\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": ["# Run chain\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " 
return_source_documents=True,\n", " chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n", ")"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": ["question = \"Is probability a class topic?\""]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a class topic and the instructor assumes familiarity with basic probability and statistics.'"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "question = \"\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\""]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u3002\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01'"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"So in this class, we've tried to convey to you a broad set of principl es and tools that will \\nbe useful for doing many, many things. 
And ev ery time I teach this class, I can actually \\nvery confidently say that af ter December, no matter what yo u're going to do after this \\nDecember when you've sort of completed this class, you'll find the things you learn in \\nthis class very useful, and these things will be useful pretty much no matter what you end \\nup doing later in your life. \\nSo I have more logistics to go over later, but let's say a few more words about machine \\nlearning. I feel that machine learning grew out of early work in AI, early work in artificial \\nintelligence. And over the last \u2014 I wanna say last 15 or last 20 years or so, it's been viewed as a sort of growing new capability for computers. And in particular, it turns out \\nthat there are many programs or there are many applications that you can't program by \\nhand. \\nFor example, if you want to get a computer to read handwritten characters, to read sort of \\nhandwritten digits, that actual ly turns out to be amazingly difficult to write a piece of \\nsoftware to take this input, an image of some thing that I wrote and to figure out just what \\nit is, to translate my cursive handwriting into \u2014 to extract the characters I wrote out in \\nlonghand. And other things: One thing that my students and I do is autonomous flight. 
It \\nturns out to be extremely difficult to sit dow n and write a program to fly a helicopter.\", metadata={'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 2})"]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"source_documents\"][0]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8fd9\u79cd\u65b9\u6cd5\u975e\u5e38\u597d\uff0c\u56e0\u4e3a\u5b83\u53ea\u6d89\u53ca\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u3002\u7136\u800c\uff0c\u5b83\u4e5f\u6709\u5c40\u9650\u6027\uff0c\u5373\u5982\u679c\u6587\u6863\u592a\u591a\uff0c\u53ef\u80fd\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u9002\u914d\u5230\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528\u53e6\u4e00\u79cd\u6280\u672f\u6765\u5bf9\u6587\u6863\u8fdb\u884c\u95ee\u7b54\uff0c\u5373MapReduce\u6280\u672f\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u5728 MapReduce \u6280\u672f\u4e2d\uff0c\u9996\u5148\u5c06\u6bcf\u4e2a\u72ec\u7acb\u7684\u6587\u6863\u5355\u72ec\u53d1\u9001\u5230\u8bed\u8a00\u6a21\u578b\u4ee5\u83b7\u53d6\u539f\u59cb\u7b54\u6848\u3002\u7136\u540e\uff0c\u8fd9\u4e9b\u7b54\u6848\u901a\u8fc7\u6700\u7ec8\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u7ec4\u5408\u6210\u6700\u7ec8\u7684\u7b54\u6848\u3002\u867d\u7136\u8fd9\u6837\u6d89\u53ca\u4e86\u66f4\u591a\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\uff0c\u4f46\u5b83\u7684\u4f18\u52bf\u5728\u4e8e\u53ef\u4ee5\u5904\u7406\u4efb\u610f\u6570\u91cf\u7684\u6587\u6863\u3002\n"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], 
"source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["'It is not clear from the given portion of the document whether probability is a class topic or not. The text only mentions that familiarity with basic probability and statistics is assumed as a prerequisite for the class.'"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u6839\u636e\u7ed9\u51fa\u7684\u6587\u4ef6\u90e8\u5206\uff0c\u6ca1\u6709\u63d0\u5230\u6982\u7387\u8bba\u3002'"]}, "execution_count": 55, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")\n", "# \u4e2d\u6587\u7248\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": 
["\u5f53\u6211\u4eec\u5c06\u4e4b\u524d\u7684\u95ee\u9898\u901a\u8fc7\u8fd9\u4e2a\u94fe\u8fdb\u884c\u8fd0\u884c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u8fd9\u79cd\u65b9\u6cd5\u7684\u4e24\u4e2a\u95ee\u9898\u3002\u7b2c\u4e00\uff0c\u901f\u5ea6\u8981\u6162\u5f97\u591a\u3002\u7b2c\u4e8c\uff0c\u7ed3\u679c\u5b9e\u9645\u4e0a\u66f4\u5dee\u3002\u6839\u636e\u7ed9\u5b9a\u6587\u6863\u7684\u8fd9\u4e00\u90e8\u5206\uff0c\u5bf9\u8fd9\u4e2a\u95ee\u9898\u5e76\u6ca1\u6709\u660e\u786e\u7684\u7b54\u6848\u3002\u8fd9\u53ef\u80fd\u662f\u56e0\u4e3a\u5b83\u662f\u57fa\u4e8e\u6bcf\u4e2a\u6587\u6863\u5355\u72ec\u56de\u7b54\u7684\u3002\u56e0\u6b64\uff0c\u5982\u679c\u4fe1\u606f\u5206\u5e03\u5728\u4e24\u4e2a\u6587\u6863\u4e4b\u95f4\uff0c\u5b83\u5e76\u6ca1\u6709\u5728\u540c\u4e00\u4e0a\u4e0b\u6587\u4e2d\u83b7\u53d6\u5230\u6240\u6709\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["#import os\n", "#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n", "#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\" # replace dots with your api key"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u5bfc\u5165\u4e0a\u8ff0\u73af\u5883\u53d8\u91cf\uff0c\u7136\u540e\u63a2\u5bfb MapReduce \u6587\u6863\u94fe\u7684\u7ec6\u8282\u3002\u4f8b\u5982\uff0c\u4e0a\u8ff0\u6f14\u793a\u4e2d\uff0c\u6211\u4eec\u5b9e\u9645\u4e0a\u6d89\u53ca\u4e86\u56db\u4e2a\u5355\u72ec\u7684\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\u3002\u5728\u8fd0\u884c\u5b8c\u6bcf\u4e2a\u6587\u6863\u540e\uff0c\u5b83\u4eec\u4f1a\u5728\u6700\u7ec8\u94fe\u5f0f\u4e2d\u7ec4\u5408\u5728\u4e00\u8d77\uff0c\u5373Stuffed Documents\u94fe\uff0c\u5c06\u6240\u6709\u8fd9\u4e9b\u56de\u7b54\u5408\u5e76\u5230\u6700\u7ec8\u7684\u8c03\u7528\u4e2d\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", 
"metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u7c7b\u4f3c\u5730\u8bbe\u7f6e\u94fe\u5f0f\u7c7b\u578b\u4e3aRefine\u3002\u8fd9\u662f\u4e00\u79cd\u65b0\u7684\u94fe\u5f0f\u7c7b\u578b\u3002Refine \u6587\u6863\u94fe\u7c7b\u4f3c\u4e8e MapReduce \u94fe\uff0c\u5bf9\u4e8e\u6bcf\u4e00\u4e2a\u6587\u6863\uff0c\u4f1a\u8c03\u7528\u4e00\u6b21 LLM\uff0c\u4f46\u6709\u6240\u6539\u8fdb\u7684\u662f\uff0c\u6211\u4eec\u6bcf\u6b21\u53d1\u9001\u7ed9 LLM \u7684\u6700\u7ec8\u63d0\u793a\u662f\u4e00\u4e2a\u5e8f\u5217\uff0c\u8fd9\u4e2a\u5e8f\u5217\u4f1a\u5c06\u5148\u524d\u7684\u54cd\u5e94\u4e0e\u65b0\u6570\u636e\u7ed3\u5408\u5728\u4e00\u8d77\uff0c\u5e76\u8bf7\u6c42\u5f97\u5230\u6539\u8fdb\u540e\u7684\u54cd\u5e94\u3002\u56e0\u6b64\uff0c\u8fd9\u662f\u4e00\u79cd\u7c7b\u4f3c\u4e8e RNN \u7684\u6982\u5ff5\uff0c\u6211\u4eec\u589e\u5f3a\u4e86\u4e0a\u4e0b\u6587\uff0c\u4ece\u800c\u89e3\u51b3\u4fe1\u606f\u5206\u5e03\u5728\u4e0d\u540c\u6587\u6863\u7684\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional context provided, probability is assumed to be a prerequisite and not a main topic of the class. The instructor assumes that students are familiar with basic probability and statistics, including random variables, expectation, variance, and basic linear algebra. The class will not be very programming-intensive, but some programming will be done in MATLAB or Octave. The instructor will provide a refresher course on the prerequisites in some of the discussion sections. The class also assumes familiarity with basic linear algebra, including matrices, vectors, matrix multiplication, and matrix inverse. Most undergraduate linear algebra courses, such as Math 51, 103, Math 113, or CS205 at Stanford, are more than enough. 
The instructor will also review the prerequisites in some of the discussion sections.'"]}, "execution_count": 56, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional context provided, the instructor mentions that they will cover statistics and algebra in the discussion sections as a refresher, and will also use the discussion sections to go over extensions of the material taught in the main lectures. However, there is no explicit mention of probability theory being covered in the course. Therefore, the original answer still stands.'"]}, "execution_count": 57, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u4f60\u4f1a\u6ce8\u610f\u5230\uff0c\u8fd9\u4e2a\u7ed3\u679c\u6bd4MapReduce\u94fe\u7684\u7ed3\u679c\u8981\u597d\u3002\u8fd9\u662f\u56e0\u4e3a\u4f7f\u7528Refined Chain\u5141\u8bb8\u4f60\u9010\u4e2a\u5730\u7ec4\u5408\u4fe1\u606f\uff0c\u5b9e\u9645\u4e0a\u6bd4MapReduce\u94fe\u9f13\u52b1\u66f4\u591a\u7684\u4fe1\u606f\u4f20\u9012\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u5728\u8fd9\u91cc\u505a\u4e00\u4e2a\u5b9e\u9a8c\u3002\n", "\n", 
"\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2aQA\u94fe\uff0c\u4f7f\u7528\u9ed8\u8ba4\u7684stuff\u3002\u8ba9\u6211\u4eec\u95ee\u4e00\u4e2a\u95ee\u9898\uff0c\u6982\u7387\u8bba\u662f\u8bfe\u7a0b\u7684\u4e3b\u9898\u5417\uff1f\u5b83\u4f1a\u56de\u7b54\uff0c\u6982\u7387\u8bba\u5e94\u8be5\u662f\u5148\u51b3\u6761\u4ef6\u3002\u6211\u4eec\u5c06\u8ffd\u95ee\uff0c\u4e3a\u4ec0\u4e48\u9700\u8981\u8fd9\u4e9b\u5148\u51b3\u6761\u4ef6\uff1f\u7136\u540e\u6211\u4eec\u5f97\u5230\u4e86\u4e00\u4e2a\u7b54\u6848\u3002\u8fd9\u95e8\u8bfe\u7684\u5148\u51b3\u6761\u4ef6\u662f\u5047\u5b9a\u5177\u6709\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u8fd9\u4e0e\u4e4b\u524d\u95ee\u6709\u5173\u6982\u7387\u7684\u95ee\u9898\u6beb\u4e0d\u76f8\u5173\u3002"]}, {"cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": ["qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a topic in this class. The speaker assumes that students have familiarity with basic probability and statistics, and mentions that most undergraduate statistics classes will be more than enough preparation for this class.'"]}, "execution_count": 59, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The prerequisites are needed because in this class, the instructor assumes that all students have a basic knowledge of computer science and knowledge of basic computer skills and principles. This includes understanding of big-O notation and other fundamental concepts. 
Without this basic knowledge, it may be difficult to understand the material covered in the class.'"]}, "execution_count": 60, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"why are those prerequesites needed?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u4f5c\u8005\u5728\u6587\u4e2d\u63d0\u5230\u4e86\u8fd9\u95e8\u8bfe\u7a0b\u9700\u8981\u5b66\u751f\u5177\u5907\u57fa\u672c\u7684\u6982\u7387\u8bba\u548c\u7edf\u8ba1\u5b66\u77e5\u8bc6\u3002'"]}, "execution_count": 62, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u6982\u7387\u8bba\u662f\u8fd9\u8282\u8bfe\u7684\u4e00\u4e2a\u5185\u5bb9\u5417\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u5728\u8fd9\u6bb5\u4e0a\u4e0b\u6587\u4e2d\uff0c\u4f5c\u8005\u63d0\u5230\u8fd9\u4e9b\u77e5\u8bc6\u662f\u8fd9\u95e8\u8bfe\u7a0b\u7684\u5148\u51b3\u6761\u4ef6\uff0c\u56e0\u4e3a\u8fd9\u95e8\u8bfe\u7a0b\u6d89\u53ca\u5230\u673a\u5668\u5b66\u4e60\u7684\u57fa\u672c\u6982\u5ff5\u548c\u7b97\u6cd5\uff0c\u9700\u8981\u5b66\u751f\u5177\u5907\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u5982\u679c\u5b66\u751f\u6ca1\u6709\u8fd9\u4e9b\u57fa\u7840\u77e5\u8bc6\uff0c\u53ef\u80fd\u4f1a\u5f88\u96be\u7406\u89e3\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u7b97\u6cd5\u3002\u56e0\u6b64\uff0c\u5b66\u751f\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\u624d\u80fd\u66f4\u597d\u5730\u5b66\u4e60\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u3002'"]}, "execution_count": 63, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u4e3a\u4ec0\u4e48\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\"\n", "result = qa_chain({\"query\": 
question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u672c\u4e0a\uff0c\u6211\u4eec\u4f7f\u7528\u7684\u94fe\u5f0f\uff08chain\uff09\u6ca1\u6709\u4efb\u4f55\u72b6\u6001\u7684\u6982\u5ff5\u3002\u5b83\u4e0d\u8bb0\u5f97\u4e4b\u524d\u7684\u95ee\u9898\u6216\u4e4b\u524d\u7684\u7b54\u6848\u3002\u4e3a\u4e86\u5b9e\u73b0\u8fd9\u4e00\u70b9\uff0c\u6211\u4eec\u9700\u8981\u5f15\u5165\u5185\u5b58\uff0c\u8fd9\u662f\u6211\u4eec\u5c06\u5728\u4e0b\u4e00\u8282\u4e2d\u8ba8\u8bba\u7684\u5185\u5bb9\u3002"]}], "metadata": {"kernelspec": {"display_name": "gpt", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}, "orig_nbformat": 4}, "nbformat": 4, "nbformat_minor": 2} \ No newline at end of file diff --git a/content/LangChain Chat with Your Data/do.py b/content/LangChain Chat with Your Data/do.py new file mode 100644 index 0000000..1400185 --- /dev/null +++ b/content/LangChain Chat with Your Data/do.py @@ -0,0 +1,29 @@ +import os +import codecs +import json + +def add_toc(ipynb_file): + f = codecs.open(ipynb_file, 'r') + source = f.read() + y = json.loads(source) + toc = ["\n"] + for item in y["cells"]: + if item["cell_type"]=='markdown': + item_start = item['source'][0].strip("\n") + if item_start.startswith("#"): + l = len(item_start.split()[0]) + if l<=3 and l>1: + name = " ".join(item_start.split(" ")[1:]) + tag = "-".join(item_start.split(" ")[1:]) + tab = " "*(l-2) + toc.append(f' {tab}- [{name}](#{tag})\n') + + y["cells"][0]['source']= y["cells"][0]['source'][0:1] + y["cells"][0]['source'].extend(toc) + f = codecs.open(ipynb_file, 'w') + f.write(json.dumps(y)) + f.close() + +for file in os.listdir("."): + if file.endswith("ipynb") and file[0].isdigit(): + add_toc(file) \ No newline at end of file diff --git 
a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/chroma-collections.parquet b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/chroma-collections.parquet new file mode 100644 index 0000000..733f3e1 Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/chroma-collections.parquet differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/chroma-embeddings.parquet b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/chroma-embeddings.parquet new file mode 100644 index 0000000..d4da979 Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/chroma-embeddings.parquet differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/id_to_uuid_77c71dc2-3023-464f-a329-90595320250f.pkl b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/id_to_uuid_77c71dc2-3023-464f-a329-90595320250f.pkl deleted file mode 100644 index 2d2601a..0000000 Binary files a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/id_to_uuid_77c71dc2-3023-464f-a329-90595320250f.pkl and /dev/null differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/id_to_uuid_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/id_to_uuid_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl new file mode 100644 index 0000000..d91ee2f Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/id_to_uuid_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_77c71dc2-3023-464f-a329-90595320250f.bin b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.bin similarity index 86% rename from content/LangChain Chat with Your 
Data/docs/chroma/cs229_lectures/index/index_77c71dc2-3023-464f-a329-90595320250f.bin rename to content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.bin index 3337fa8..f44e71e 100644 Binary files a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_77c71dc2-3023-464f-a329-90595320250f.bin and b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.bin differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_metadata_77c71dc2-3023-464f-a329-90595320250f.pkl b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_metadata_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl similarity index 62% rename from content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_metadata_77c71dc2-3023-464f-a329-90595320250f.pkl rename to content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_metadata_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl index 060383e..62d26aa 100644 Binary files a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_metadata_77c71dc2-3023-464f-a329-90595320250f.pkl and b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/index_metadata_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/uuid_to_id_77c71dc2-3023-464f-a329-90595320250f.pkl b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/uuid_to_id_77c71dc2-3023-464f-a329-90595320250f.pkl deleted file mode 100644 index 329622c..0000000 Binary files a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/uuid_to_id_77c71dc2-3023-464f-a329-90595320250f.pkl and /dev/null differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/uuid_to_id_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl 
b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/uuid_to_id_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl new file mode 100644 index 0000000..f9830e2 Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/cs229_lectures/index/uuid_to_id_eb8f4e00-842c-4cfd-a16b-ac92826c96cc.pkl differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/chroma-collections.parquet b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/chroma-collections.parquet new file mode 100644 index 0000000..8be0680 Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/chroma-collections.parquet differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/chroma-embeddings.parquet b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/chroma-embeddings.parquet new file mode 100644 index 0000000..45bf83b Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/chroma-embeddings.parquet differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/id_to_uuid_96510571-d0cf-41be-8f06-e35c32c20f64.pkl b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/id_to_uuid_96510571-d0cf-41be-8f06-e35c32c20f64.pkl new file mode 100644 index 0000000..4284a90 Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/id_to_uuid_96510571-d0cf-41be-8f06-e35c32c20f64.pkl differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/id_to_uuid_a8528bee-9668-46ed-82ec-6795ea07159f.pkl b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/id_to_uuid_a8528bee-9668-46ed-82ec-6795ea07159f.pkl deleted file mode 100644 index eaf5622..0000000 Binary files a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/id_to_uuid_a8528bee-9668-46ed-82ec-6795ea07159f.pkl and /dev/null differ diff --git a/content/LangChain Chat with Your 
Data/docs/chroma/matplotlib/index/index_a8528bee-9668-46ed-82ec-6795ea07159f.bin b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_96510571-d0cf-41be-8f06-e35c32c20f64.bin similarity index 100% rename from content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_a8528bee-9668-46ed-82ec-6795ea07159f.bin rename to content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_96510571-d0cf-41be-8f06-e35c32c20f64.bin diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_metadata_a8528bee-9668-46ed-82ec-6795ea07159f.pkl b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_metadata_96510571-d0cf-41be-8f06-e35c32c20f64.pkl similarity index 62% rename from content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_metadata_a8528bee-9668-46ed-82ec-6795ea07159f.pkl rename to content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_metadata_96510571-d0cf-41be-8f06-e35c32c20f64.pkl index 37f18fd..3ae9676 100644 Binary files a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_metadata_a8528bee-9668-46ed-82ec-6795ea07159f.pkl and b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/index_metadata_96510571-d0cf-41be-8f06-e35c32c20f64.pkl differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/uuid_to_id_96510571-d0cf-41be-8f06-e35c32c20f64.pkl b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/uuid_to_id_96510571-d0cf-41be-8f06-e35c32c20f64.pkl new file mode 100644 index 0000000..a86c57c Binary files /dev/null and b/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/uuid_to_id_96510571-d0cf-41be-8f06-e35c32c20f64.pkl differ diff --git a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/uuid_to_id_a8528bee-9668-46ed-82ec-6795ea07159f.pkl b/content/LangChain Chat with Your 
Data/docs/chroma/matplotlib/index/uuid_to_id_a8528bee-9668-46ed-82ec-6795ea07159f.pkl deleted file mode 100644 index e3363a0..0000000 Binary files a/content/LangChain Chat with Your Data/docs/chroma/matplotlib/index/uuid_to_id_a8528bee-9668-46ed-82ec-6795ea07159f.pkl and /dev/null differ