diff --git a/CM3060 Natural Language Processing/Week 3/2.2.5 Text normalization.ipynb b/CM3060 Natural Language Processing/Week 3/2.2.5 Text normalization.ipynb new file mode 100644 index 0000000..aa50cc9 --- /dev/null +++ b/CM3060 Natural Language Processing/Week 3/2.2.5 Text normalization.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4f0923c7-3ebb-4809-8015-0aff4a04569c", + "metadata": {}, + "source": [ + "# Text normalization\n", + "* To relate variations of words to a common form/root, e.g.\n", + "\n", + " run, runs, running -> run
\n", + " am, are, is -> be\n", + "* Includes both stemming and lemmatization\n", + "* Need to consider both inflectional forms and derivational forms" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3135d8e9-c434-4fb1-b1dc-b51e3077f60a", + "metadata": {}, + "outputs": [], + "source": [ + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ddc9f631-2624-456b-8c40-857bcc9a41a0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to C:\\nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", + "from nltk import wordnet\n", + "nltk.download('wordnet')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "52346f65-74e0-4b2f-87bb-d0284ea3f99b", + "metadata": {}, + "outputs": [], + "source": [ + "stemmer = PorterStemmer()\n", + "wnl = WordNetLemmatizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f57a43a1-368f-4642-bb2f-1241d300e608", + "metadata": {}, + "outputs": [], + "source": [ + "word = 'meeting'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a33c32b9-4ad2-4e50-bd1c-23d5707f5644", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('meet', 'meeting')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stemmer.stem(word), wnl.lemmatize(word)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4f199f56-1dea-4204-8f4f-2ec3cf8e4937", + "metadata": {}, + "outputs": [], + "source": [ + "words1 = ['I', 'am', 'going', 'to', 'a', 'meeting', 'about', 'data', 'science']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": 
"ab5e6fef-7968-496a-9624-f9f9437e04e7", + "metadata": {}, + "outputs": [], + "source": [ + "words2 = ['dogs', 'churches', 'aardwolves', 'abaci']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "80a461c8-4235-4ddd-9629-c2b717d10319", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['dog', 'church', 'aardwolv', 'abaci']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[stemmer.stem(word) for word in words2]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e592c511-6121-4af7-92a3-81199e760276", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['dog', 'church', 'aardwolf', 'abacus']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[wnl.lemmatize(word) for word in words2]" + ] + }, + { + "cell_type": "markdown", + "id": "8e1509a2-2ad4-4b82-a5c4-cb79527ec8f5", + "metadata": {}, + "source": [ + "The lemmatizer did a better job with the irregular plurals: it maps 'aardwolves' and 'abaci' to the dictionary forms 'aardwolf' and 'abacus', whereas the stemmer produces the non-word 'aardwolv' and leaves 'abaci' unchanged." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "50936259-fd59-4b32-9ef6-28dbc011e044", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['i', 'am', 'go', 'to', 'a', 'meet', 'about', 'data', 'scienc']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[stemmer.stem(word) for word in words1]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e138f7be-80dd-44ae-9777-6effe1f4a12e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I', 'am', 'going', 'to', 'a', 'meeting', 'about', 'data', 'science']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[wnl.lemmatize(word) for word in words1]" + ] + }, + { + "cell_type": "markdown", + "id": "fcd69584-c622-430d-bd44-eab4342dc064", + "metadata": {}, + "source": [ +
"Lemmatization depends on the part of speech: `lemmatize` treats each word as a noun unless told otherwise, which is why 'am', 'going' and 'meeting' were left unchanged above. Below we pass `pos='v'` so the terms are treated as verbs." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3ac7b59d-39d4-41ee-a8b8-a7ae284760af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I', 'be', 'go', 'to', 'a', 'meet', 'about', 'data', 'science']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[wnl.lemmatize(word, pos='v') for word in words1]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}