{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "51825c1c-1a87-4ef6-b332-ea8c7c785d1b",
   "metadata": {},
   "source": [
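    "# Word tokenization\n",
    "\n",
    "Split sample text into sentences with `nltk.sent_tokenize`, then into word tokens with `nltk.word_tokenize`, to see how NLTK treats e-mail addresses, dates, URLs, abbreviations, apostrophes and hyphenated words."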
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "862d8df2-59d2-4cab-b2c8-e3e0390476bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "# The tokenizers below need NLTK's Punkt sentence-splitting data, which is downloaded\n",
    "# separately from the package. Probe once and fetch the data only if the lookup fails,\n",
    "# so the notebook also runs top to bottom in a fresh environment.\n",
    "try:\n",
    "    nltk.sent_tokenize('probe')\n",
    "except LookupError:\n",
    "    # Older NLTK releases load 'punkt', newer ones 'punkt_tab'; get both.\n",
    "    for resource in ('punkt', 'punkt_tab'):\n",
    "        nltk.download(resource, quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e56f638f-39a0-402b-832a-09232552748c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First sample: ordinary sentences plus cases that challenge whitespace splitting -\n",
    "# an e-mail address, a slash-separated date, apostrophes and a hyphenated word.\n",
    "text = '''Most of the time we can use white space.\n",
    "But what about fred@gmail.com or 13/01/2021?\n",
    "The students' attempts aren't working.\n",
    "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "71fa1ba1-e9c4-4b2d-a99d-f55d877bafd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most of the time we can use white space.\n",
      "---\n",
      "But what about fred@gmail.com or 13/01/2021?\n",
      "---\n",
      "The students' attempts aren't working.\n",
      "---\n",
      "Maybe it's the use of apostrophes?\n",
      "---\n",
      "Or it might need a more up-to-date model.\n"
     ]
    }
   ],
   "source": [
    "# Break the sample into sentences and print them with a '---' line between each.\n",
    "sents = nltk.sent_tokenize(text)\n",
    "print(*sents, sep='\\n---\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "34c811f0-78fd-4c9c-b608-ea72ce5f2ff8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
       " ['But', 'what', 'about', 'fred', '@', 'gmail.com', 'or', '13/01/2021', '?'],\n",
       " ['The', 'students', \"'\", 'attempts', 'are', \"n't\", 'working', '.'],\n",
       " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
       " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One list of word tokens per sentence.\n",
    "list(map(nltk.word_tokenize, sents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3b4d148d-8a72-4c14-9627-45d4797d92c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second sample (rebinds `text`): adds a URL, hyphenated names, a dotted abbreviation\n",
    "# and a title ('Prof.') followed by a period that does not end the sentence.\n",
    "text = '''Most of the time we can use white space.\n",
    "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
    "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
    "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9298755b-2492-4952-8bd5-1d327dcc16b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most of the time we can use white space.\n",
      "---\n",
      "But what about http://www.nltk.org/, Hewlett-Packard or I.B.M.?\n",
      "---\n",
      "Prof. Russell-Rose thinks that the students' attempts to use nltk's tokenizer aren't working.\n",
      "---\n",
      "Maybe it's the use of apostrophes?\n",
      "---\n",
      "Or it might need a more up-to-date model.\n"
     ]
    }
   ],
   "source": [
    "# Break the second sample into sentences and print them with a '---' line between each.\n",
    "sents = nltk.sent_tokenize(text)\n",
    "print(*sents, sep='\\n---\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "89c94dd1-0f5c-404d-adc3-f23b2516a530",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Most', 'of', 'the', 'time', 'we', 'can', 'use', 'white', 'space', '.'],\n",
       " ['But',\n",
       "  'what',\n",
       "  'about',\n",
       "  'http',\n",
       "  ':',\n",
       "  '//www.nltk.org/',\n",
       "  ',',\n",
       "  'Hewlett-Packard',\n",
       "  'or',\n",
       "  'I.B.M',\n",
       "  '.',\n",
       "  '?'],\n",
       " ['Prof.',\n",
       "  'Russell-Rose',\n",
       "  'thinks',\n",
       "  'that',\n",
       "  'the',\n",
       "  'students',\n",
       "  \"'\",\n",
       "  'attempts',\n",
       "  'to',\n",
       "  'use',\n",
       "  'nltk',\n",
       "  \"'s\",\n",
       "  'tokenizer',\n",
       "  'are',\n",
       "  \"n't\",\n",
       "  'working',\n",
       "  '.'],\n",
       " ['Maybe', 'it', \"'s\", 'the', 'use', 'of', 'apostrophes', '?'],\n",
       " ['Or', 'it', 'might', 'need', 'a', 'more', 'up-to-date', 'model', '.']]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One list of word tokens per sentence of the second sample.\n",
    "list(map(nltk.word_tokenize, sents))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}