From 46a8ff7bdc605a77035fce582625d245fac5cd7f Mon Sep 17 00:00:00 2001 From: levdoescode Date: Sat, 14 Jan 2023 08:07:06 -0500 Subject: [PATCH] Conditonal formating notebook completed --- ... Conditional frequency distributions.ipynb | 266 ++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 CM3060 Natural Language Processing/Week 5/3.1.9 Conditional frequency distributions.ipynb diff --git a/CM3060 Natural Language Processing/Week 5/3.1.9 Conditional frequency distributions.ipynb b/CM3060 Natural Language Processing/Week 5/3.1.9 Conditional frequency distributions.ipynb new file mode 100644 index 0000000..d2acd3e --- /dev/null +++ b/CM3060 Natural Language Processing/Week 5/3.1.9 Conditional frequency distributions.ipynb @@ -0,0 +1,266 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conditional frequency distributions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Using simple bigrams" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Download the Brown corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package brown to C:\\nltk_data...\n", + "[nltk_data] Package brown is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "from nltk.corpus import brown\n", + "nltk.download('brown')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Create a bigram model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "text = brown.words(categories='news')\n", + "bigrams = nltk.bigrams(text)\n", + "cfd = nltk.ConditionalFreqDist(bigrams)" + ] + }, + { + "cell_type": "code", + 
def generate_text(cfdist, word, num=50):
    """Print up to ``num`` words, always following the most frequent bigram.

    Starting from ``word``, each step prints the current word followed by a
    space and then moves to the single most frequent successor recorded in
    ``cfdist``. Because the choice is deterministic, the output falls into a
    repeating cycle as soon as a word is revisited.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            that follow it. Each distribution must provide ``max()``
            (e.g. an ``nltk.ConditionalFreqDist`` built from bigrams).
        word: The word to start from.
        num: Maximum number of words to print.

    Returns:
        None. The text is written to standard output on a single line.
    """
    for _ in range(num):
        print(word, end=' ')
        # Membership test first: indexing a defaultdict-style cfdist with an
        # unseen word would silently insert an empty distribution.
        if word not in cfdist:
            break
        followers = cfdist[word]
        # max() is undefined on an empty distribution (it raises ValueError),
        # so a word with no recorded successor ends the generation instead.
        if not followers:
            break
        word = followers.max()
import random


def generate_text(cfdist, word, num=100, n=2, rng=None):
    """Print up to ``num`` words, picking each next word at random from the
    ``n`` most frequent successors of the current word.

    Steps, as listed in the notebook:
      1. ``n`` is the number of bigrams to consider.
      2. The bigrams for the current word are taken as a frequency
         distribution.
      3. A list of the top ``n`` successors is built.
      4. One of them is picked at random and assigned to ``word``.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            that follow it. Each distribution must provide ``most_common()``
            (e.g. an ``nltk.ConditionalFreqDist`` built from bigrams).
        word: The word to start from.
        num: Maximum number of words to print.
        n: How many of the most frequent successors to choose between.
        rng: Optional object with a ``choice()`` method (for example a seeded
            ``random.Random``) for reproducible output. Defaults to the
            module-level ``random`` generator.

    Returns:
        None. The text is written to standard output on a single line.
    """
    chooser = random if rng is None else rng
    for _ in range(num):
        print(word, end=' ')
        # Membership test first: indexing a defaultdict-style cfdist with an
        # unseen word would silently insert an empty distribution.
        if word not in cfdist:
            break
        fdist = cfdist[word]
        # most_common(n) ranks by count. Slicing keys() would instead take the
        # first n successors in insertion order, which is not the top n.
        candidates = [follower for follower, _count in fdist.most_common(n)]
        # No successor recorded (or n <= 0): choice() on an empty list raises
        # IndexError, so stop generating instead.
        if not candidates:
            break
        word = chooser.choice(candidates)
The Fulton County Grand Jury indictments with city personnel as `` no evidence `` no -- and thanks of the election , `` no -- and thanks of Atlanta's new multi-million-dollar airport , which had been charged mental cruelty " + ] + } + ], + "source": [ + "generate_text(cfd, 'He', n=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "He told The Fulton legislators `` deserves closer and election was received 1,119 votes on a result , `` irregularities '' for its appointed temporary assistant more than three years of the City of Atlanta and election , the City of Atlanta's recent years . The Fulton legislators allotted to have these funds through its appointed and the praise to the City Purchasing Department , which was received and election was conducted . `` deserves the election produced the City of the City Purchasing Department . The jury had been agreed to have a number of Atlanta's recent years of " + ] + } + ], + "source": [ + "generate_text(cfd, 'He', n=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "He has been charged the manner in the praise to have a relative merits of possible revisions in the City Purchasing Department . It recommended federal legislation . `` deserves the election was won by Fulton County purchasing and often ambiguous '' in which it said Friday in which it believes `` no word to achieve this problem '' in term-end presentments that the praise to investigate dog . `` deserves a swipe at which was received and often hear a relative merits of Atlanta's Morehouse ( Red Sox today proposed Thursday against racial discrimination in which was conducted by " + ] + } + ], + "source": [ + "generate_text(cfd, 'He', n=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Discussion:\n", + "Why does the original version get stuck in a loop so easily? \n", + "How does introducing some randomness solve this problem? \n", + "What effect does increasing N (the number of bigrams to consider) have? Does it make the text more or less intelligible? If so, why?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}