{ "cells": [ { "cell_type": "markdown", "id": "4f0923c7-3ebb-4809-8015-0aff4a04569c", "metadata": {}, "source": [ "# Text normalization\n", "* To relate variations of words to a common form/root, e.g.\n", "\n", " run, runs, running -> run
\n", " am, are, is -> be\n", "* Includes both stemming and lemmatization\n", "* Need to consider both inflectional forms and derivational forms" ] }, { "cell_type": "code", "execution_count": 2, "id": "3135d8e9-c434-4fb1-b1dc-b51e3077f60a", "metadata": {}, "outputs": [], "source": [ "import nltk" ] }, { "cell_type": "code", "execution_count": 3, "id": "ddc9f631-2624-456b-8c40-857bcc9a41a0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to C:\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", "from nltk import wordnet\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": 5, "id": "52346f65-74e0-4b2f-87bb-d0284ea3f99b", "metadata": {}, "outputs": [], "source": [ "stemmer = PorterStemmer()\n", "wnl = WordNetLemmatizer()" ] }, { "cell_type": "code", "execution_count": 6, "id": "f57a43a1-368f-4642-bb2f-1241d300e608", "metadata": {}, "outputs": [], "source": [ "word = 'meeting'" ] }, { "cell_type": "code", "execution_count": 7, "id": "a33c32b9-4ad2-4e50-bd1c-23d5707f5644", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('meet', 'meeting')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stemmer.stem(word), wnl.lemmatize(word)" ] }, { "cell_type": "code", "execution_count": 8, "id": "4f199f56-1dea-4204-8f4f-2ec3cf8e4937", "metadata": {}, "outputs": [], "source": [ "words1 = ['I', 'am', 'going', 'to', 'a', 'meeting', 'about', 'data', 'science']" ] }, { "cell_type": "code", "execution_count": 9, "id": "ab5e6fef-7968-496a-9624-f9f9437e04e7", "metadata": {}, "outputs": [], "source": [ "words2 = ['dogs', 'churches', 'aardwolves', 'abaci']" ] }, { "cell_type": "code", "execution_count": 10, "id": 
"80a461c8-4235-4ddd-9629-c2b717d10319", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['dog', 'church', 'aardwolv', 'abaci']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[stemmer.stem(word) for word in words2]" ] }, { "cell_type": "code", "execution_count": 12, "id": "e592c511-6121-4af7-92a3-81199e760276", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['dog', 'church', 'aardwolf', 'abacus']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[wnl.lemmatize(word) for word in words2]" ] }, { "cell_type": "markdown", "id": "8e1509a2-2ad4-4b82-a5c4-cb79527ec8f5", "metadata": {}, "source": [ "The lemmatizer did a better job than the stemmer on the irregular plurals: it returned the dictionary forms `aardwolf` and `abacus`, whereas the stemmer produced the non-word `aardwolv` and left `abaci` unchanged." ] }, { "cell_type": "code", "execution_count": 14, "id": "50936259-fd59-4b32-9ef6-28dbc011e044", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['i', 'am', 'go', 'to', 'a', 'meet', 'about', 'data', 'scienc']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[stemmer.stem(word) for word in words1]" ] }, { "cell_type": "code", "execution_count": 15, "id": "e138f7be-80dd-44ae-9777-6effe1f4a12e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['I', 'am', 'going', 'to', 'a', 'meeting', 'about', 'data', 'science']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[wnl.lemmatize(word) for word in words1]" ] }, { "cell_type": "markdown", "id": "fcd69584-c622-430d-bd44-eab4342dc064", "metadata": {}, "source": [ "Lemmatization is sensitive to linguistic context, in particular the part of speech. By default `lemmatize` treats every word as a noun, which is why the output above is unchanged. Below we pass `pos='v'` so that the terms are treated as verbs." ] }, { "cell_type": "code", "execution_count": 16, "id": "3ac7b59d-39d4-41ee-a8b8-a7ae284760af", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['I', 'be', 'go', 'to', 'a', 'meet', 'about', 'data', 'science']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source":
[ "[wnl.lemmatize(word, pos='v') for word in words1]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 5 }