{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "51825c1c-1a87-4ef6-b332-ea8c7c785d1b",
   "metadata": {},
   "source": [
    "# Word tokenization\n",
    "\n",
    "This notebook runs NLTK's default sentence splitter (`nltk.sent_tokenize`) and word tokenizer (`nltk.word_tokenize`) over two short texts that contain inputs which plain white-space splitting handles badly: e-mail addresses, dates, URLs, abbreviations, hyphenated words and apostrophes.\n",
    "\n",
    "Each example keeps its own variables (`text_simple` / `sents_simple`, `text_tricky` / `sents_tricky`), so any cell can be re-run on its own and still reproduce the output shown beneath it.\n",
    "\n",
    "**Note:** if `nltk.sent_tokenize` raises a `LookupError` in a fresh environment, the tokenizer data has not been downloaded yet; the error message names the resource to fetch with `nltk.download(...)`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "862d8df2-59d2-4cab-b2c8-e3e0390476bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b7e2a14-5c3d-4f8e-9a61-2d4c6e8f0a13",
   "metadata": {},
   "source": [
    "## Example 1: e-mail address, date, apostrophes and hyphens\n",
    "\n",
    "The text is first split into sentences, then each sentence is split into word tokens."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e56f638f-39a0-402b-832a-09232552748c",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_simple = '''Most of the time we can use white space.\n",
    "But what about fred@gmail.com or 13/01/2021?\n",
    "The students' attempts aren't working.\n",
    "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "71fa1ba1-e9c4-4b2d-a99d-f55d877bafd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most of the time we can use white space.\n",
      "---\n",
      "But what about fred@gmail.com or 13/01/2021?\n",
      "---\n",
      "The students' attempts aren't working.\n",
      "---\n",
      "Maybe it's the use of apostrophes?\n",
      "---\n",
      "Or it might need a more up-to-date model.\n"
     ]
    }
   ],
   "source": [
    "sents_simple = nltk.sent_tokenize(text_simple)\n",
    "print('\\n---\\n'.join(sents_simple))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "34c811f0-78fd-4c9c-b608-ea72ce5f2ff8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
       " ['But', 'what', 'about', 'fred', '@', 'gmail.com', 'or', '13/01/2021', '?'],\n",
       " ['The', 'students', \"'\", 'attempts', 'are', \"n't\", 'working', '.'],\n",
       " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
       " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[nltk.word_tokenize(s) for s in sents_simple]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4d9f1a2-7b36-4e05-8c2f-13a5b7d9e0f4",
   "metadata": {},
   "source": [
    "**Observations**\n",
    "\n",
    "- The two sentences that share the last line of the text are separated at the `?`, giving five sentences in total.\n",
    "- The e-mail address is broken into `fred`, `@` and `gmail.com`.\n",
    "- The date `13/01/2021` and the hyphenated `up-to-date` each stay a single token.\n",
    "- Apostrophes are split off: `students'` becomes `students` + `'`, `aren't` becomes `are` + `n't`, and `it's` becomes `it` + `'s`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f3a9c1e-2d48-4b6a-a5e0-9c1b3d5f7a26",
   "metadata": {},
   "source": [
    "## Example 2: URL, abbreviations and hyphenated names\n",
    "\n",
    "The same two steps, applied to a text with a URL, a title (`Prof.`), an initialism (`I.B.M.`) and hyphenated proper names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3b4d148d-8a72-4c14-9627-45d4797d92c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_tricky = '''Most of the time we can use white space.\n",
    "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
    "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
    "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9298755b-2492-4952-8bd5-1d327dcc16b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most of the time we can use white space.\n",
      "---\n",
      "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
      "---\n",
      "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
      "---\n",
      "Maybe it's the use of apostrophes?\n",
      "---\n",
      "Or it might need a more up-to-date model.\n"
     ]
    }
   ],
   "source": [
    "sents_tricky = nltk.sent_tokenize(text_tricky)\n",
    "print('\\n---\\n'.join(sents_tricky))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "89c94dd1-0f5c-404d-adc3-f23b2516a530",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
       " ['But',\n",
       "  'what',\n",
       "  'about',\n",
       "  'http',\n",
       "  ':',\n",
       "  '//www.nltk.org/',\n",
       "  ',',\n",
       "  'Hewlett-Packard',\n",
       "  'or',\n",
       "  'I.B.M',\n",
       "  '.',\n",
       "  '?'],\n",
       " ['Prof.',\n",
       "  'Russell-Rose',\n",
       "  'thinks',\n",
       "  'that',\n",
       "  'the',\n",
       "  'students',\n",
       "  \"'\",\n",
       "  'attempts',\n",
       "  'to',\n",
       "  'use',\n",
       "  'nltk',\n",
       "  \"'s\",\n",
       "  'tokenizer',\n",
       "  'are',\n",
       "  \"n't\",\n",
       "  'working',\n",
       "  '.'],\n",
       " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
       " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[nltk.word_tokenize(s) for s in sents_tricky]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2b6d8f0-4a1c-4d3e-b7f9-5a0c2e416b88",
   "metadata": {},
   "source": [
    "**Observations**\n",
    "\n",
    "- The sentence splitter does not break after `Prof.` or inside `I.B.M.`; the text still yields five sentences.\n",
    "- The URL is broken into `http`, `:` and `//www.nltk.org/`, with the trailing comma as its own token.\n",
    "- The hyphenated names `Hewlett-Packard` and `Russell-Rose` each stay a single token.\n",
    "- `I.B.M.?` becomes `I.B.M`, `.` and `?`, whereas `Prof.` keeps its period.\n",
    "- The possessive `nltk's` is split into `nltk` + `'s`, in the same way as `it's`."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}