diff --git a/CM3060 Natural Language Processing/Week 3/2.2.1 Sentence segmentation.ipynb b/CM3060 Natural Language Processing/Week 3/2.2.1 Sentence segmentation.ipynb
new file mode 100644
index 0000000..b373181
--- /dev/null
+++ b/CM3060 Natural Language Processing/Week 3/2.2.1 Sentence segmentation.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1878fa45-3b4a-44f5-b5fa-97c00adb4593",
+   "metadata": {},
+   "source": [
+    "# Sentence segmentation\n",
+    "\n",
+    "Split a short sample text into sentences with NLTK's `sent_tokenize` and inspect the result."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "aff71edc-bb8f-49dc-a4fd-bbac1a74ebed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3708c40d-cb73-4a6e-96ad-6d6e70c862ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sent_tokenize needs the Punkt sentence models. Older NLTK releases load the\n",
+    "# 'punkt' package, while NLTK 3.8.2 and later load 'punkt_tab', so fetch both\n",
+    "# to keep the notebook runnable on either.\n",
+    "for resource in ('punkt', 'punkt_tab'):\n",
+    "    nltk.download(resource, quiet=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bd55a5f7-ff62-47b8-928b-d5b60b92e9f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sample text: quoted '!', an ellipsis, a sentence over two lines, a lower-case start.\n",
+    "text = '''Punctuation denotes the end of a sentence!\n",
+    "“But not always!”, said Fred... (who is a professor).\n",
+    "Some sentences could break over multiple lines\n",
+    "and begin with a lower case letter.\n",
+    "a1 could be a variable name.\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5fd18291-2765-4841-9cdf-4efbe110bded",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Punctuation denotes the end of a sentence!\\n“But not always!”, said Fred... (who is a professor).\\nSome sentences could break over multiple lines\\nand begin with a lower case letter.\\na1 could be a variable name.\\n'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2a010ec4-c5ad-4429-be04-7f3626d3a6e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sents = nltk.sent_tokenize(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "74e585c9-a137-4008-824d-699dcb470943",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'“But not always!”, said Fred... (who is a professor).'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sents[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "815c6f21-4c1f-491a-96cf-b6ba6878eb67",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(sents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "6c895aea-7fc0-47f0-a9ca-6710735da51a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Punctuation denotes the end of a sentence!\n",
+      "“But not always!”, said Fred... (who is a professor).\n",
+      "Some sentences could break over multiple lines\n",
+      "and begin with a lower case letter.\n",
+      "a1 could be a variable name.\n"
+     ]
+    }
+   ],
+   "source": [
+    "for s in sents:\n",
+    "    print(s)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "82da27f5-4ad0-4ec5-a20f-e83646a60fc4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Punctuation denotes the end of a sentence!\n",
+      "---\n",
+      "“But not always!”, said Fred... (who is a professor).\n",
+      "---\n",
+      "Some sentences could break over multiple lines\n",
+      "and begin with a lower case letter.\n",
+      "---\n",
+      "a1 could be a variable name.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('\\n---\\n'.join(sents))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "aecb0078-42ae-432a-8fc5-3f1cd9baa5db",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4\n",
+      "\n",
+      "Punctuation denotes the end of a sentence!\n",
+      "“But not always!”, said Fred... (who is a professor).\n",
+      "Some sentences could break over multiple lines\n",
+      "and begin with a lower case letter.\n",
+      "a1 could be a variable name.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Recap: tokenize the sample `text` defined above, then print the sentence\n",
+    "# count followed by each sentence.\n",
+    "sents = nltk.sent_tokenize(text)\n",
+    "print(len(sents))\n",
+    "print()\n",
+    "for s in sents:\n",
+    "    print(s)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}