Notebook uploaded

This commit is contained in:
levdoescode
2023-01-12 18:50:39 -05:00
parent 2243eaf0d5
commit 908e48b87c

View File

@@ -0,0 +1,202 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "51825c1c-1a87-4ef6-b332-ea8c7c785d1b",
"metadata": {},
"source": [
"# Word tokenization"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "862d8df2-59d2-4cab-b2c8-e3e0390476bf",
"metadata": {},
"outputs": [],
"source": [
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e56f638f-39a0-402b-832a-09232552748c",
"metadata": {},
"outputs": [],
"source": [
"text = '''Most of the time we can use white space.\n",
"But what about fred@gmail.com or 13/01/2021?\n",
"The students' attempts aren't working.\n",
"Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "71fa1ba1-e9c4-4b2d-a99d-f55d877bafd4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Most of the time we can use white space.\n",
"---\n",
"But what about fred@gmail.com or 13/01/2021?\n",
"---\n",
"The students' attempts aren't working.\n",
"---\n",
"Maybe it's the use of apostrophes?\n",
"---\n",
"Or it might need a more up-to-date model.\n"
]
}
],
"source": [
"sents = nltk.sent_tokenize(text)\n",
"print('\\n---\\n'.join(sents))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "34c811f0-78fd-4c9c-b608-ea72ce5f2ff8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
" ['But', 'what', 'about', 'fred', '@', 'gmail.com', 'or', '13/01/2021', '?'],\n",
" ['The', 'students', \"'\", 'attempts', 'are', \"n't\", 'working', '.'],\n",
" ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
" ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[nltk.word_tokenize(s) for s in sents]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3b4d148d-8a72-4c14-9627-45d4797d92c9",
"metadata": {},
"outputs": [],
"source": [
"text = '''Most of the time we can use white space.\n",
"But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
"Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
"Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9298755b-2492-4952-8bd5-1d327dcc16b7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Most of the time we can use white space.\n",
"---\n",
"But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
"---\n",
"Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
"---\n",
"Maybe it's the use of apostrophes?\n",
"---\n",
"Or it might need a more up-to-date model.\n"
]
}
],
"source": [
"sents = nltk.sent_tokenize(text)\n",
"print('\\n---\\n'.join(sents))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "89c94dd1-0f5c-404d-adc3-f23b2516a530",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
" ['But',\n",
" 'what',\n",
" 'about',\n",
" 'http',\n",
" ':',\n",
" '//www.nltk.org/',\n",
" ',',\n",
" 'Hewlett-Packard',\n",
" 'or',\n",
" 'I.B.M',\n",
" '.',\n",
" '?'],\n",
" ['Prof.',\n",
" 'Russell-Rose',\n",
" 'thinks',\n",
" 'that',\n",
" 'the',\n",
" 'students',\n",
" \"'\",\n",
" 'attempts',\n",
" 'to',\n",
" 'use',\n",
" 'nltk',\n",
" \"'s\",\n",
" 'tokenizer',\n",
" 'are',\n",
" \"n't\",\n",
" 'working',\n",
" '.'],\n",
" ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
" ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[nltk.word_tokenize(s) for s in sents]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}