Notebook uploaded

2023-01-12 18:50:39 -05:00
parent 2243eaf0d5
commit 908e48b87c
1 changed file with 202 additions and 0 deletions
--- a/tokenization.ipynb
+++ b/tokenization.ipynb
@@ -0,0 +1,202 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "51825c1c-1a87-4ef6-b332-ea8c7c785d1b",
+   "metadata": {},
+   "source": [
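+    "# Word tokenization\n\nThis notebook splits sample text into sentences with `nltk.sent_tokenize` and then into word tokens with `nltk.word_tokenize`, showing how NLTK handles e-mail addresses, dates, URLs, abbreviations, hyphenated words and apostrophes."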
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "862d8df2-59d2-4cab-b2c8-e3e0390476bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e56f638f-39a0-402b-832a-09232552748c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = '''Most of the time we can use white space.\n",
+    "But what about fred@gmail.com or 13/01/2021?\n",
+    "The students' attempts aren't working.\n",
+    "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "71fa1ba1-e9c4-4b2d-a99d-f55d877bafd4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most of the time we can use white space.\n",
+      "---\n",
+      "But what about fred@gmail.com or 13/01/2021?\n",
+      "---\n",
+      "The students' attempts aren't working.\n",
+      "---\n",
+      "Maybe it's the use of apostrophes?\n",
+      "---\n",
+      "Or it might need a more up-to-date model.\n"
+     ]
+    }
+   ],
+   "source": [
+    "sents = nltk.sent_tokenize(text)\n",
+    "print('\\n---\\n'.join(sents))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "34c811f0-78fd-4c9c-b608-ea72ce5f2ff8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
+       " ['But', 'what', 'about', 'fred', '@', 'gmail.com', 'or', '13/01/2021', '?'],\n",
+       " ['The', 'students', \"'\", 'attempts', 'are', \"n't\", 'working', '.'],\n",
+       " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
+       " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "[nltk.word_tokenize(s) for s in sents]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3b4d148d-8a72-4c14-9627-45d4797d92c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = '''Most of the time we can use white space.\n",
+    "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
+    "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
+    "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "9298755b-2492-4952-8bd5-1d327dcc16b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most of the time we can use white space.\n",
+      "---\n",
+      "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
+      "---\n",
+      "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
+      "---\n",
+      "Maybe it's the use of apostrophes?\n",
+      "---\n",
+      "Or it might need a more up-to-date model.\n"
+     ]
+    }
+   ],
+   "source": [
+    "sents = nltk.sent_tokenize(text)\n",
+    "print('\\n---\\n'.join(sents))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "89c94dd1-0f5c-404d-adc3-f23b2516a530",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
+       " ['But',\n",
+       "  'what',\n",
+       "  'about',\n",
+       "  'http',\n",
+       "  ':',\n",
+       "  '//www.nltk.org/',\n",
+       "  ',',\n",
+       "  'Hewlett-Packard',\n",
+       "  'or',\n",
+       "  'I.B.M',\n",
+       "  '.',\n",
+       "  '?'],\n",
+       " ['Prof.',\n",
+       "  'Russell-Rose',\n",
+       "  'thinks',\n",
+       "  'that',\n",
+       "  'the',\n",
+       "  'students',\n",
+       "  \"'\",\n",
+       "  'attempts',\n",
+       "  'to',\n",
+       "  'use',\n",
+       "  'nltk',\n",
+       "  \"'s\",\n",
+       "  'tokenizer',\n",
+       "  'are',\n",
+       "  \"n't\",\n",
+       "  'working',\n",
+       "  '.'],\n",
+       " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
+       " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "[nltk.word_tokenize(s) for s in sents]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}