1 line
29 KiB
Plaintext
1 line
29 KiB
Plaintext
{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# \u7b2c\u516d\u7ae0 \u95ee\u7b54\n", "\n", " - [\u4e00\u3001\u5f15\u8a00](#\u4e00\u3001\u5f15\u8a00)\n", " - [\u4e8c\u3001\u73af\u5883\u914d\u7f6e](#\u4e8c\u3001\u73af\u5883\u914d\u7f6e)\n", " - [\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93](#\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93)\n", " - [\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u8fde](#\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u8fde)\n", " - [\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.1-\u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.2-\u57fa\u4e8e-MapReduce-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.3-\u57fa\u4e8e-Refine-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55](#\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55)\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e00\u3001\u5f15\u8a00\n", "\n", "\n", "\u5728\u4e0a\u4e00\u7ae0\uff0c\u6211\u4eec\u5df2\u7ecf\u8ba8\u8bba\u4e86\u5982\u4f55\u68c0\u7d22\u4e0e\u7ed9\u5b9a\u95ee\u9898\u76f8\u5173\u7684\u6587\u6863\u3002\u4e0b\u4e00\u6b65\u662f\u83b7\u53d6\u8fd9\u4e9b\u6587\u6863\uff0c\u62ff\u5230\u539f\u59cb\u95ee\u9898\uff0c\u5c06\u5b83\u4eec\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u8981\u6c42\u5b83\u56de\u7b54\u8fd9\u4e2a\u95ee\u9898\u3002\u5728\u672c\u8bfe\u7a0b\u4e2d\uff0c\u6211\u4eec\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u8fd9\u4e00\u8fc7\u7a0b\uff0c\u4ee5\u53ca\u5b8c\u6210\u8fd9\u9879\u4efb\u52a1\u7684\u51e0\u79cd\u4e0d\u540c\u65b9\u6cd5\u3002"]}, {"cell_type": "markdown", 
"metadata": {}, "source": ["\u6211\u4eec\u5df2\u7ecf\u5b8c\u6210\u4e86\u6574\u4e2a\u5b58\u50a8\u548c\u83b7\u53d6\uff0c\u83b7\u53d6\u4e86\u76f8\u5173\u7684\u5207\u5206\u6587\u6863\u4e4b\u540e\uff0c\u73b0\u5728\u6211\u4eec\u9700\u8981\u5c06\u5b83\u4eec\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u4ee5\u83b7\u5f97\u7b54\u6848\u3002\u8fd9\u4e2a\u8fc7\u7a0b\u7684\u4e00\u822c\u6d41\u7a0b\u5982\u4e0b\uff1a\u9996\u5148\u95ee\u9898\u88ab\u63d0\u51fa\uff0c\u7136\u540e\u6211\u4eec\u67e5\u627e\u76f8\u5173\u7684\u6587\u6863\uff0c\u63a5\u7740\u5c06\u8fd9\u4e9b\u5207\u5206\u6587\u6863\u548c\u7cfb\u7edf\u63d0\u793a\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u83b7\u5f97\u7b54\u6848\u3002\n", "\n", "\u9ed8\u8ba4\u60c5\u51b5\u4e0b\uff0c\u6211\u4eec\u5c06\u6240\u6709\u7684\u6587\u6863\u5207\u7247\u90fd\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\uff0c\u5373\u540c\u4e00\u6b21\u8bed\u8a00\u6a21\u578b\u8c03\u7528\u4e2d\u3002\u7136\u800c\uff0c\u6709\u4e00\u4e9b\u4e0d\u540c\u7684\u65b9\u6cd5\u53ef\u4ee5\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u5b83\u4eec\u90fd\u6709\u4f18\u7f3a\u70b9\u3002\u5927\u90e8\u5206\u4f18\u70b9\u6765\u81ea\u4e8e\u6709\u65f6\u53ef\u80fd\u4f1a\u6709\u5f88\u591a\u6587\u6863\uff0c\u4f46\u4f60\u7b80\u5355\u5730\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002MapReduce\u3001Refine \u548c MapRerank \u662f\u4e09\u79cd\u65b9\u6cd5\uff0c\u7528\u4e8e\u89e3\u51b3\u8fd9\u4e2a\u77ed\u4e0a\u4e0b\u6587\u7a97\u53e3\u7684\u95ee\u9898\u3002\u6211\u4eec\u5c06\u5728\u8be5\u8bfe\u7a0b\u4e2d\u8fdb\u884c\u7b80\u8981\u4ecb\u7ecd\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e8c\u3001\u73af\u5883\u914d\u7f6e"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u914d\u7f6e\u73af\u5883\u65b9\u6cd5\u540c\u524d\uff0c\u6b64\u5904\u4e0d\u518d\u8d58\u8ff0"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": 
["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57282023\u5e749\u67082\u65e5\u4e4b\u540e\uff0cGPT-3.5 API \u4f1a\u8fdb\u884c\u66f4\u65b0\uff0c\u56e0\u6b64\u6b64\u5904\u9700\u8981\u8fdb\u884c\u4e00\u4e2a\u65f6\u95f4\u5224\u65ad"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["gpt-3.5-turbo-0301\n"]}], "source": ["import datetime\n", "current_date = datetime.datetime.now().date()\n", "if current_date < datetime.date(2023, 9, 2):\n", " llm_name = \"gpt-3.5-turbo-0301\"\n", "else:\n", " llm_name = \"gpt-3.5-turbo\"\n", "print(llm_name)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7d\u5728\u4e4b\u524d\u5df2\u7ecf\u8fdb\u884c\u6301\u4e45\u5316\u7684\u5411\u91cf\u6570\u636e\u5e93\n", "from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma/cs229_lectures/'\n", "embedding = OpenAIEmbeddings()\n", "vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["# \u53ef\u4ee5\u770b\u89c1\u5305\u542b\u4e86\u6211\u4eec\u4e4b\u524d\u8fdb\u884c\u5206\u5272\u7684209\u4e2a\u6587\u6863\n", "print(vectordb._collection.count())"]}, {"cell_type": "markdown", "metadata": {}, "source": 
["\u6211\u4eec\u53ef\u4ee5\u6d4b\u8bd5\u4e00\u4e0b\u5bf9\u4e8e\u4e00\u4e2a\u63d0\u95ee\u8fdb\u884c\u5411\u91cf\u68c0\u7d22\u3002\u5982\u4e0b\u4ee3\u7801\u4f1a\u5728\u5411\u91cf\u6570\u636e\u5e93\u4e2d\u6839\u636e\u76f8\u4f3c\u6027\u8fdb\u884c\u68c0\u7d22\uff0c\u8fd4\u56de\u7ed9\u4f60 k \u4e2a\u6587\u6863\u3002"]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 35, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u8fde"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u4e8e LangChain\uff0c\u6211\u4eec\u53ef\u4ee5\u6784\u9020\u4e00\u4e2a\u4f7f\u7528 GPT3.5 \u8fdb\u884c\u95ee\u7b54\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\uff0c\u8fd9\u662f\u4e00\u79cd\u901a\u8fc7\u68c0\u7d22\u6b65\u9aa4\u8fdb\u884c\u95ee\u7b54\u7684\u65b9\u6cd5\u3002\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u4f20\u5165\u4e00\u4e2a\u8bed\u8a00\u6a21\u578b\u548c\u4e00\u4e2a\u5411\u91cf\u6570\u636e\u5e93\u6765\u521b\u5efa\u5b83\u4f5c\u4e3a\u68c0\u7d22\u5668\u3002\u7136\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u7528\u95ee\u9898\u4f5c\u4e3a\u67e5\u8be2\u8c03\u7528\u5b83\uff0c\u5f97\u5230\u4e00\u4e2a\u7b54\u6848\u3002"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["# \u4f7f\u7528 ChatGPT3.5\uff0c\u6e29\u5ea6\u8bbe\u7f6e\u4e3a0\n", "from langchain.chat_models import ChatOpenAI\n", "llm = 
ChatOpenAI(model_name=llm_name, temperature=0)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["# \u5bfc\u5165\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "from langchain.chains import RetrievalQA"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["# \u58f0\u660e\u4e00\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"What are major topics for this class?\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The major topic for this class is machine learning. Additionally, there may be some discussion on statistics and algebra as a refresher, and later in the quarter, there may be some discussion on extensions for the material covered in the main lectures.'"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u4ece\u8fd9\u4e9b\u4e0a\u4e0b\u6587\u6765\u770b\uff0c\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u5305\u62ec\u8bfe\u7a0b\u4fe1\u606f\u3001\u5728\u7ebf\u8d44\u6e90\u548c\u7ebf\u6027\u4ee3\u6570\u3002\n"]}], "source": ["print(result[\"result\"])"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## 
\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u901a\u8fc7\u4e0a\u8ff0\u4ee3\u7801\uff0c\u6211\u4eec\u53ef\u4ee5\u5b9e\u73b0\u4e00\u4e2a\u7b80\u5355\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u3002\u63a5\u4e0b\u6765\uff0c\u8ba9\u6211\u4eec\u6df1\u5165\u5176\u4e2d\u7684\u7ec6\u8282\uff0c\u770b\u770b\u5728\u8fd9\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u4e2d\uff0cLangChain \u90fd\u505a\u4e86\u4e9b\u4ec0\u4e48\u3002\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\n", "\u6211\u4eec\u9996\u5148\u5b9a\u4e49\u4e86\u4e00\u4e2a\u63d0\u793a\u6a21\u677f\u3002\u5b83\u5305\u542b\u4e00\u4e9b\u5173\u4e8e\u5982\u4f55\u4f7f\u7528\u4e0b\u9762\u7684\u4e0a\u4e0b\u6587\u7247\u6bb5\u7684\u8bf4\u660e\uff0c\u7136\u540e\u6709\u4e00\u4e2a\u4e0a\u4e0b\u6587\u53d8\u91cf\u7684\u5360\u4f4d\u7b26\u3002"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. 
\n", "{context}\n", "Question: {question}\n", "Helpful Answer:\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"\u4f7f\u7528\u4ee5\u4e0b\u4e0a\u4e0b\u6587\u7247\u6bb5\u6765\u56de\u7b54\u6700\u540e\u7684\u95ee\u9898\u3002\u5982\u679c\u4f60\u4e0d\u77e5\u9053\u7b54\u6848\uff0c\u53ea\u9700\u8bf4\u4e0d\u77e5\u9053\uff0c\u4e0d\u8981\u8bd5\u56fe\u7f16\u9020\u7b54\u6848\u3002\u7b54\u6848\u6700\u591a\u4f7f\u7528\u4e09\u4e2a\u53e5\u5b50\u3002\u5c3d\u91cf\u7b80\u660e\u627c\u8981\u5730\u56de\u7b54\u3002\u5728\u56de\u7b54\u7684\u6700\u540e\u4e00\u5b9a\u8981\u8bf4\"\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01\"\n", "{context}\n", "\u95ee\u9898\uff1a{question}\n", "\u6709\u7528\u7684\u56de\u7b54\uff1a\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": ["# Run chain\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " return_source_documents=True,\n", " chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n", ")"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": ["question = \"Is probability a class topic?\""]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a class topic and the instructor assumes familiarity with basic probability and statistics.'"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": ["# 
\u4e2d\u6587\u7248\n", "question = \"\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\""]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u3002\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01'"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"So in this class, we've tried to convey to you a broad set of principl es and tools that will \\nbe useful for doing many, many things. And ev ery time I teach this class, I can actually \\nvery confidently say that af ter December, no matter what yo u're going to do after this \\nDecember when you've sort of completed this class, you'll find the things you learn in \\nthis class very useful, and these things will be useful pretty much no matter what you end \\nup doing later in your life. \\nSo I have more logistics to go over later, but let's say a few more words about machine \\nlearning. I feel that machine learning grew out of early work in AI, early work in artificial \\nintelligence. And over the last \u2014 I wanna say last 15 or last 20 years or so, it's been viewed as a sort of growing new capability for computers. And in particular, it turns out \\nthat there are many programs or there are many applications that you can't program by \\nhand. 
\\nFor example, if you want to get a computer to read handwritten characters, to read sort of \\nhandwritten digits, that actual ly turns out to be amazingly difficult to write a piece of \\nsoftware to take this input, an image of some thing that I wrote and to figure out just what \\nit is, to translate my cursive handwriting into \u2014 to extract the characters I wrote out in \\nlonghand. And other things: One thing that my students and I do is autonomous flight. It \\nturns out to be extremely difficult to sit dow n and write a program to fly a helicopter.\", metadata={'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 2})"]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"source_documents\"][0]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8fd9\u79cd\u65b9\u6cd5\u975e\u5e38\u597d\uff0c\u56e0\u4e3a\u5b83\u53ea\u6d89\u53ca\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u3002\u7136\u800c\uff0c\u5b83\u4e5f\u6709\u5c40\u9650\u6027\uff0c\u5373\u5982\u679c\u6587\u6863\u592a\u591a\uff0c\u53ef\u80fd\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u9002\u914d\u5230\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528\u53e6\u4e00\u79cd\u6280\u672f\u6765\u5bf9\u6587\u6863\u8fdb\u884c\u95ee\u7b54\uff0c\u5373MapReduce\u6280\u672f\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u5728 MapReduce 
\u6280\u672f\u4e2d\uff0c\u9996\u5148\u5c06\u6bcf\u4e2a\u72ec\u7acb\u7684\u6587\u6863\u5355\u72ec\u53d1\u9001\u5230\u8bed\u8a00\u6a21\u578b\u4ee5\u83b7\u53d6\u539f\u59cb\u7b54\u6848\u3002\u7136\u540e\uff0c\u8fd9\u4e9b\u7b54\u6848\u901a\u8fc7\u6700\u7ec8\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u7ec4\u5408\u6210\u6700\u7ec8\u7684\u7b54\u6848\u3002\u867d\u7136\u8fd9\u6837\u6d89\u53ca\u4e86\u66f4\u591a\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\uff0c\u4f46\u5b83\u7684\u4f18\u52bf\u5728\u4e8e\u53ef\u4ee5\u5904\u7406\u4efb\u610f\u6570\u91cf\u7684\u6587\u6863\u3002\n"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["'It is not clear from the given portion of the document whether probability is a class topic or not. 
The text only mentions that familiarity with basic probability and statistics is assumed as a prerequisite for the class.'"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u6839\u636e\u7ed9\u51fa\u7684\u6587\u4ef6\u90e8\u5206\uff0c\u6ca1\u6709\u63d0\u5230\u6982\u7387\u8bba\u3002'"]}, "execution_count": 55, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")\n", "# \u4e2d\u6587\u7248\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u5f53\u6211\u4eec\u5c06\u4e4b\u524d\u7684\u95ee\u9898\u901a\u8fc7\u8fd9\u4e2a\u94fe\u8fdb\u884c\u8fd0\u884c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u8fd9\u79cd\u65b9\u6cd5\u7684\u4e24\u4e2a\u95ee\u9898\u3002\u7b2c\u4e00\uff0c\u901f\u5ea6\u8981\u6162\u5f97\u591a\u3002\u7b2c\u4e8c\uff0c\u7ed3\u679c\u5b9e\u9645\u4e0a\u66f4\u5dee\uff1a\u6a21\u578b\u56de\u7b54\u8bf4\uff0c\u6839\u636e\u7ed9\u5b9a\u6587\u6863\u7684\u8fd9\u4e00\u90e8\u5206\uff0c\u5bf9\u8fd9\u4e2a\u95ee\u9898\u5e76\u6ca1\u6709\u660e\u786e\u7684\u7b54\u6848\u3002\u8fd9\u53ef\u80fd\u662f\u56e0\u4e3a\u5b83\u662f\u57fa\u4e8e\u6bcf\u4e2a\u6587\u6863\u5355\u72ec\u56de\u7b54\u7684\u3002\u56e0\u6b64\uff0c\u5982\u679c\u4fe1\u606f\u5206\u5e03\u5728\u4e24\u4e2a\u6587\u6863\u4e4b\u95f4\uff0c\u5b83\u5e76\u6ca1\u6709\u5728\u540c\u4e00\u4e0a\u4e0b\u6587\u4e2d\u83b7\u53d6\u5230\u6240\u6709\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["#import os\n", "#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n", 
"#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\" # replace dots with your api key"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u5bfc\u5165\u4e0a\u8ff0\u73af\u5883\u53d8\u91cf\uff0c\u7136\u540e\u63a2\u5bfb MapReduce \u6587\u6863\u94fe\u7684\u7ec6\u8282\u3002\u4f8b\u5982\uff0c\u4e0a\u8ff0\u6f14\u793a\u4e2d\uff0c\u6211\u4eec\u5b9e\u9645\u4e0a\u6d89\u53ca\u4e86\u56db\u4e2a\u5355\u72ec\u7684\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\u3002\u5728\u8fd0\u884c\u5b8c\u6bcf\u4e2a\u6587\u6863\u540e\uff0c\u5b83\u4eec\u4f1a\u5728\u6700\u7ec8\u94fe\u5f0f\u4e2d\u7ec4\u5408\u5728\u4e00\u8d77\uff0c\u5373Stuffed Documents\u94fe\uff0c\u5c06\u6240\u6709\u8fd9\u4e9b\u56de\u7b54\u5408\u5e76\u5230\u6700\u7ec8\u7684\u8c03\u7528\u4e2d\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u7c7b\u4f3c\u5730\u8bbe\u7f6e\u94fe\u5f0f\u7c7b\u578b\u4e3aRefine\u3002\u8fd9\u662f\u4e00\u79cd\u65b0\u7684\u94fe\u5f0f\u7c7b\u578b\u3002Refine \u6587\u6863\u94fe\u7c7b\u4f3c\u4e8e MapReduce \u94fe\uff0c\u5bf9\u4e8e\u6bcf\u4e00\u4e2a\u6587\u6863\uff0c\u4f1a\u8c03\u7528\u4e00\u6b21 LLM\uff0c\u4f46\u6709\u6240\u6539\u8fdb\u7684\u662f\uff0c\u6211\u4eec\u6bcf\u6b21\u53d1\u9001\u7ed9 LLM \u7684\u6700\u7ec8\u63d0\u793a\u662f\u4e00\u4e2a\u5e8f\u5217\uff0c\u8fd9\u4e2a\u5e8f\u5217\u4f1a\u5c06\u5148\u524d\u7684\u54cd\u5e94\u4e0e\u65b0\u6570\u636e\u7ed3\u5408\u5728\u4e00\u8d77\uff0c\u5e76\u8bf7\u6c42\u5f97\u5230\u6539\u8fdb\u540e\u7684\u54cd\u5e94\u3002\u56e0\u6b64\uff0c\u8fd9\u662f\u4e00\u79cd\u7c7b\u4f3c\u4e8e RNN \u7684\u6982\u5ff5\uff0c\u6211\u4eec\u589e\u5f3a\u4e86\u4e0a\u4e0b\u6587\uff0c\u4ece\u800c\u89e3\u51b3\u4fe1\u606f\u5206\u5e03\u5728\u4e0d\u540c\u6587\u6863\u7684\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional 
context provided, probability is assumed to be a prerequisite and not a main topic of the class. The instructor assumes that students are familiar with basic probability and statistics, including random variables, expectation, variance, and basic linear algebra. The class will not be very programming-intensive, but some programming will be done in MATLAB or Octave. The instructor will provide a refresher course on the prerequisites in some of the discussion sections. The class also assumes familiarity with basic linear algebra, including matrices, vectors, matrix multiplication, and matrix inverse. Most undergraduate linear algebra courses, such as Math 51, 103, Math 113, or CS205 at Stanford, are more than enough. The instructor will also review the prerequisites in some of the discussion sections.'"]}, "execution_count": 56, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional context provided, the instructor mentions that they will cover statistics and algebra in the discussion sections as a refresher, and will also use the discussion sections to go over extensions of the material taught in the main lectures. However, there is no explicit mention of probability theory being covered in the course. 
Therefore, the original answer still stands.'"]}, "execution_count": 57, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u4f60\u4f1a\u6ce8\u610f\u5230\uff0c\u8fd9\u4e2a\u7ed3\u679c\u6bd4MapReduce\u94fe\u7684\u7ed3\u679c\u8981\u597d\u3002\u8fd9\u662f\u56e0\u4e3a\u4f7f\u7528 Refine \u94fe\u5141\u8bb8\u4f60\u9010\u4e2a\u5730\u7ec4\u5408\u4fe1\u606f\uff0c\u5b9e\u9645\u4e0a\u6bd4MapReduce\u94fe\u9f13\u52b1\u66f4\u591a\u7684\u4fe1\u606f\u4f20\u9012\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u5728\u8fd9\u91cc\u505a\u4e00\u4e2a\u5b9e\u9a8c\u3002\n", "\n", "\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2aQA\u94fe\uff0c\u4f7f\u7528\u9ed8\u8ba4\u7684 stuff \u94fe\u7c7b\u578b\u3002\u8ba9\u6211\u4eec\u95ee\u4e00\u4e2a\u95ee\u9898\uff0c\u6982\u7387\u8bba\u662f\u8bfe\u7a0b\u7684\u4e3b\u9898\u5417\uff1f\u5b83\u4f1a\u56de\u7b54\uff0c\u6982\u7387\u8bba\u5e94\u8be5\u662f\u5148\u51b3\u6761\u4ef6\u3002\u6211\u4eec\u5c06\u8ffd\u95ee\uff0c\u4e3a\u4ec0\u4e48\u9700\u8981\u8fd9\u4e9b\u5148\u51b3\u6761\u4ef6\uff1f\u7136\u540e\u6211\u4eec\u5f97\u5230\u4e86\u4e00\u4e2a\u7b54\u6848\uff1a\u8fd9\u95e8\u8bfe\u7684\u5148\u51b3\u6761\u4ef6\u662f\u5047\u5b9a\u5177\u6709\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u8fd9\u4e0e\u4e4b\u524d\u95ee\u6709\u5173\u6982\u7387\u7684\u95ee\u9898\u6beb\u4e0d\u76f8\u5173\u3002"]}, {"cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": ["qa_chain = 
RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a topic in this class. The speaker assumes that students have familiarity with basic probability and statistics, and mentions that most undergraduate statistics classes will be more than enough preparation for this class.'"]}, "execution_count": 59, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The prerequisites are needed because in this class, the instructor assumes that all students have a basic knowledge of computer science and knowledge of basic computer skills and principles. This includes understanding of big-O notation and other fundamental concepts. 
Without this basic knowledge, it may be difficult to understand the material covered in the class.'"]}, "execution_count": 60, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"why are those prerequesites needed?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u4f5c\u8005\u5728\u6587\u4e2d\u63d0\u5230\u4e86\u8fd9\u95e8\u8bfe\u7a0b\u9700\u8981\u5b66\u751f\u5177\u5907\u57fa\u672c\u7684\u6982\u7387\u8bba\u548c\u7edf\u8ba1\u5b66\u77e5\u8bc6\u3002'"]}, "execution_count": 62, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u6982\u7387\u8bba\u662f\u8fd9\u8282\u8bfe\u7684\u4e00\u4e2a\u5185\u5bb9\u5417\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u5728\u8fd9\u6bb5\u4e0a\u4e0b\u6587\u4e2d\uff0c\u4f5c\u8005\u63d0\u5230\u8fd9\u4e9b\u77e5\u8bc6\u662f\u8fd9\u95e8\u8bfe\u7a0b\u7684\u5148\u51b3\u6761\u4ef6\uff0c\u56e0\u4e3a\u8fd9\u95e8\u8bfe\u7a0b\u6d89\u53ca\u5230\u673a\u5668\u5b66\u4e60\u7684\u57fa\u672c\u6982\u5ff5\u548c\u7b97\u6cd5\uff0c\u9700\u8981\u5b66\u751f\u5177\u5907\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u5982\u679c\u5b66\u751f\u6ca1\u6709\u8fd9\u4e9b\u57fa\u7840\u77e5\u8bc6\uff0c\u53ef\u80fd\u4f1a\u5f88\u96be\u7406\u89e3\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u7b97\u6cd5\u3002\u56e0\u6b64\uff0c\u5b66\u751f\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\u624d\u80fd\u66f4\u597d\u5730\u5b66\u4e60\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u3002'"]}, "execution_count": 63, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u4e3a\u4ec0\u4e48\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\"\n", "result = qa_chain({\"query\": 
question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u672c\u4e0a\uff0c\u6211\u4eec\u4f7f\u7528\u7684\u94fe\uff08chain\uff09\u6ca1\u6709\u4efb\u4f55\u72b6\u6001\u7684\u6982\u5ff5\u3002\u5b83\u4e0d\u8bb0\u5f97\u4e4b\u524d\u7684\u95ee\u9898\u6216\u4e4b\u524d\u7684\u7b54\u6848\u3002\u4e3a\u4e86\u8ba9\u5b83\u8bb0\u5f97\u4e4b\u524d\u7684\u95ee\u7b54\uff0c\u6211\u4eec\u9700\u8981\u5f15\u5165\u8bb0\u5fc6\uff08Memory\uff09\uff0c\u8fd9\u662f\u6211\u4eec\u5c06\u5728\u4e0b\u4e00\u8282\u4e2d\u8ba8\u8bba\u7684\u5185\u5bb9\u3002"]}], "metadata": {"kernelspec": {"display_name": "gpt", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}, "orig_nbformat": 4}, "nbformat": 4, "nbformat_minor": 2} |