Notebook uploaded
This commit is contained in:
@ -0,0 +1,263 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4f0923c7-3ebb-4809-8015-0aff4a04569c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Text normalization\n",
|
||||
"* To relate variations of words to a common form/root, e.g.\n",
|
||||
"\n",
|
||||
" run, runs, running -> run<br>\n",
|
||||
" am, are, is -> be\n",
|
||||
"* Includes both stemming and lemmatization\n",
|
||||
"* Need to consider both inflectional forms and derivational forms"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "3135d8e9-c434-4fb1-b1dc-b51e3077f60a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nltk"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "ddc9f631-2624-456b-8c40-857bcc9a41a0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package wordnet to C:\\nltk_data...\n",
|
||||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
|
||||
"from nltk import wordnet\n",
|
||||
"nltk.download('wordnet')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "52346f65-74e0-4b2f-87bb-d0284ea3f99b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"stemmer = PorterStemmer()\n",
|
||||
"wnl = WordNetLemmatizer()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f57a43a1-368f-4642-bb2f-1241d300e608",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word = 'meeting'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "a33c32b9-4ad2-4e50-bd1c-23d5707f5644",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('meet', 'meeting')"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"stemmer.stem(word), wnl.lemmatize(word)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "4f199f56-1dea-4204-8f4f-2ec3cf8e4937",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"words1 = ['I', 'am', 'going', 'to', 'a', 'meeting', 'about', 'data', 'science']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "ab5e6fef-7968-496a-9624-f9f9437e04e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"words2 = ['dogs', 'churches', 'aardwolves', 'abaci']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "80a461c8-4235-4ddd-9629-c2b717d10319",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['dog', 'church', 'aardwolv', 'abaci']"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[stemmer.stem(word) for word in words2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "e592c511-6121-4af7-92a3-81199e760276",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['dog', 'church', 'aardwolf', 'abacus']"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[wnl.lemmatize(word) for word in words2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8e1509a2-2ad4-4b82-a5c4-cb79527ec8f5",
|
||||
"metadata": {},
|
||||
"source": [
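|
||||
"The lemmatizer did a better job with the irregular plurals: it mapped `aardwolves` to `aardwolf` and `abaci` to `abacus`, whereas the stemmer produced the non-word `aardwolv` and left `abaci` unchanged."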
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "50936259-fd59-4b32-9ef6-28dbc011e044",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['i', 'am', 'go', 'to', 'a', 'meet', 'about', 'data', 'scienc']"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[stemmer.stem(word) for word in words1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "e138f7be-80dd-44ae-9777-6effe1f4a12e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['I', 'am', 'going', 'to', 'a', 'meeting', 'about', 'data', 'science']"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[wnl.lemmatize(word) for word in words1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fcd69584-c622-430d-bd44-eab4342dc064",
|
||||
"metadata": {},
|
||||
"source": [
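|
||||
"Lemmatization is sensitive to linguistic context, in particular the part of speech. Called without `pos`, `lemmatize` treats each word as a noun, which is why 'am', 'going' and 'meeting' were left unchanged above. Below we pass `pos='v'` so that the terms are treated as verbs."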
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "3ac7b59d-39d4-41ee-a8b8-a7ae284760af",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['I', 'be', 'go', 'to', 'a', 'meet', 'about', 'data', 'science']"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[wnl.lemmatize(word, pos='v') for word in words1]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user