Notebook uploaded
This commit is contained in:
@ -0,0 +1,202 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "51825c1c-1a87-4ef6-b332-ea8c7c785d1b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sentence and word tokenization with NLTK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "862d8df2-59d2-4cab-b2c8-e3e0390476bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nltk"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "e56f638f-39a0-402b-832a-09232552748c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = '''Most of the time we can use white space.\n",
|
||||
"But what about fred@gmail.com or 13/01/2021?\n",
|
||||
"The students' attempts aren't working.\n",
|
||||
"Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
|
||||
"'''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "71fa1ba1-e9c4-4b2d-a99d-f55d877bafd4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Most of the time we can use white space.\n",
|
||||
"---\n",
|
||||
"But what about fred@gmail.com or 13/01/2021?\n",
|
||||
"---\n",
|
||||
"The students' attempts aren't working.\n",
|
||||
"---\n",
|
||||
"Maybe it's the use of apostrophes?\n",
|
||||
"---\n",
|
||||
"Or it might need a more up-to-date model.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sents = nltk.sent_tokenize(text)\n",
|
||||
"print('\\n---\\n'.join(sents))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "34c811f0-78fd-4c9c-b608-ea72ce5f2ff8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
|
||||
" ['But', 'what', 'about', 'fred', '@', 'gmail.com', 'or', '13/01/2021', '?'],\n",
|
||||
" ['The', 'students', \"'\", 'attempts', 'are', \"n't\", 'working', '.'],\n",
|
||||
" ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
|
||||
" ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[nltk.word_tokenize(s) for s in sents]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "3b4d148d-8a72-4c14-9627-45d4797d92c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = '''Most of the time we can use white space.\n",
|
||||
"But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
|
||||
"Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
|
||||
"Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
|
||||
"'''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "9298755b-2492-4952-8bd5-1d327dcc16b7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Most of the time we can use white space.\n",
|
||||
"---\n",
|
||||
"But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
|
||||
"---\n",
|
||||
"Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
|
||||
"---\n",
|
||||
"Maybe it's the use of apostrophes?\n",
|
||||
"---\n",
|
||||
"Or it might need a more up-to-date model.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sents = nltk.sent_tokenize(text)\n",
|
||||
"print('\\n---\\n'.join(sents))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "89c94dd1-0f5c-404d-adc3-f23b2516a530",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
|
||||
" ['But',\n",
|
||||
" 'what',\n",
|
||||
" 'about',\n",
|
||||
" 'http',\n",
|
||||
" ':',\n",
|
||||
" '//www.nltk.org/',\n",
|
||||
" ',',\n",
|
||||
" 'Hewlett-Packard',\n",
|
||||
" 'or',\n",
|
||||
" 'I.B.M',\n",
|
||||
" '.',\n",
|
||||
" '?'],\n",
|
||||
" ['Prof.',\n",
|
||||
" 'Russell-Rose',\n",
|
||||
" 'thinks',\n",
|
||||
" 'that',\n",
|
||||
" 'the',\n",
|
||||
" 'students',\n",
|
||||
" \"'\",\n",
|
||||
" 'attempts',\n",
|
||||
" 'to',\n",
|
||||
" 'use',\n",
|
||||
" 'nltk',\n",
|
||||
" \"'s\",\n",
|
||||
" 'tokenizer',\n",
|
||||
" 'are',\n",
|
||||
" \"n't\",\n",
|
||||
" 'working',\n",
|
||||
" '.'],\n",
|
||||
" ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
|
||||
" ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[nltk.word_tokenize(s) for s in sents]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user