diff --git a/CM3060 Natural Language Processing/Week 3/2.2.3 Word tokenization.ipynb b/CM3060 Natural Language Processing/Week 3/2.2.3 Word tokenization.ipynb new file mode 100644 index 0000000..da44533 --- /dev/null +++ b/CM3060 Natural Language Processing/Week 3/2.2.3 Word tokenization.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "51825c1c-1a87-4ef6-b332-ea8c7c785d1b", + "metadata": {}, + "source": [ + "# Word tokenization" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "862d8df2-59d2-4cab-b2c8-e3e0390476bf", + "metadata": {}, + "outputs": [], + "source": [ + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e56f638f-39a0-402b-832a-09232552748c", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''Most of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71fa1ba1-e9c4-4b2d-a99d-f55d877bafd4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time we can use white space.\n", + "---\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "---\n", + "The students' attempts aren't working.\n", + "---\n", + "Maybe it's the use of apostrophes?\n", + "---\n", + "Or it might need a more up-to-date model.\n" + ] + } + ], + "source": [ + "sents = nltk.sent_tokenize(text)\n", + "print('\\n---\\n'.join(sents))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "34c811f0-78fd-4c9c-b608-ea72ce5f2ff8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n", + " ['But', 'what', 'about', 'fred', '@', 'gmail.com', 'or', '13/01/2021', '?'],\n", + " ['The', 'students', \"'\", 'attempts', 'are', \"n't\", 'working', '.'],\n", + " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n", + " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[nltk.word_tokenize(s) for s in sents]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3b4d148d-8a72-4c14-9627-45d4797d92c9", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''Most of the time we can use white space.\n", + "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n", + "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9298755b-2492-4952-8bd5-1d327dcc16b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time we can use white space.\n", + "---\n", + "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n", + "---\n", + "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n", + "---\n", + "Maybe it's the use of apostrophes?\n", + "---\n", + "Or it might need a more up-to-date model.\n" + ] + } + ], + "source": [ + "sents = nltk.sent_tokenize(text)\n", + "print('\\n---\\n'.join(sents))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "89c94dd1-0f5c-404d-adc3-f23b2516a530", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n", + " ['But',\n", + " 'what',\n", + " 'about',\n", + " 'http',\n", + " ':',\n", + " '//www.nltk.org/',\n", + " ',',\n", + " 'Hewlett-Packard',\n", + " 'or',\n", + " 'I.B.M',\n", + " '.',\n", + " '?'],\n", + " ['Prof.',\n", + " 'Russell-Rose',\n", + " 'thinks',\n", + " 'that',\n", + " 'the',\n", + " 'students',\n", + " \"'\",\n", + " 'attempts',\n", + " 'to',\n", + " 'use',\n", + " 'nltk',\n", + " \"'s\",\n", + " 'tokenizer',\n", + " 'are',\n", + " \"n't\",\n", + " 'working',\n", + " '.'],\n", + " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n", + " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[nltk.word_tokenize(s) for s in sents]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}