From 0054585a05e3133fca57a89d9951b48c9922cbb6 Mon Sep 17 00:00:00 2001 From: levdoescode Date: Fri, 13 Jan 2023 18:18:56 -0500 Subject: [PATCH] Notebook on regular expressions completed --- .../Week 4/2.3.1 Regular expressions.ipynb | 828 ++++++++++++++++++ 1 file changed, 828 insertions(+) create mode 100644 CM3060 Natural Language Processing/Week 4/2.3.1 Regular expressions.ipynb diff --git a/CM3060 Natural Language Processing/Week 4/2.3.1 Regular expressions.ipynb b/CM3060 Natural Language Processing/Week 4/2.3.1 Regular expressions.ipynb new file mode 100644 index 0000000..e1f0206 --- /dev/null +++ b/CM3060 Natural Language Processing/Week 4/2.3.1 Regular expressions.ipynb @@ -0,0 +1,828 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1b2088b6-a2c6-4ad7-9fdb-070684661332", + "metadata": {}, + "source": [ + "# Regular expressions\n", + "* A formal language for defining text strings (character sequences)\n", + "* Used for pattern matching (e.g. searching & replacing in text)
\n", + "1. Disjunctions\n", + "2. Negation\n", + "3. Optionality\n", + "4. Aliases\n", + "5. Anchors" + ] + }, + { + "cell_type": "markdown", + "id": "2c3429a6-43dc-4323-aa95-7fecec39464f", + "metadata": {}, + "source": [ + "# 1. Disjunctions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7e075952-22e9-4b38-8726-5f697c103a4f", + "metadata": {}, + "outputs": [], + "source": [ + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7283f69c-63cc-4632-9df1-1bbd5596880f", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''Most of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a3a9c07c-5b4d-4319-bda2-0c071f0b4805", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"the\" # We're using raw string syntax: the r prefix makes what follows a raw string literal" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "01a16737-575f-4bd8-8f6c-69570e91a2d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of X time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's X use of apostrophes? 
Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text)) # Substitute X for pattern in text" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7ab3b376-8735-45b8-903a-d5963db60099", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"[Tt]he\" # match with upper or lowercase t" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "20eae2e9-9206-4beb-9fd4-029afd41df54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of X time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "X students' attempts aren't working.\n", + "Maybe it's X use of apostrophes? Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text)) # Substitute X for pattern in text" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "67f70f86-3b95-4142-a452-ff60fdec4f68", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"[0-9]\" # match on digits" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f5e5847a-6faf-409d-bd96-f3482e656f60", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time we can use white space.\n", + "But what about fred@gmail.com or XX/XX/XXXX?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? 
Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text)) # Substitute X for pattern in text" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cf1bba29-cbeb-4bd6-83be-390f3423954b", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"[A-Z]\" # match on uppercase letters in the range A to Z" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6263578b-e6ac-42ea-bc6e-53838594e778", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Xost of the time we can use white space.\n", + "Xut what about fred@gmail.com or 13/01/2021?\n", + "Xhe students' attempts aren't working.\n", + "Xaybe it's the use of apostrophes? Xr it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text)) # Substitute X for pattern in text" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cb52d0db-fb29-48e1-9f96-3caa4e11eb94", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"of|the|we\" # Match on a list of words" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "59325f92-3e2f-4652-bb7a-124f905695c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most X X time X can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's X use X apostrophes? Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text)) # Substitute X for pattern in text" + ] + }, + { + "cell_type": "markdown", + "id": "68389f1d-e605-4ff9-acbf-89d2d0469a64", + "metadata": {}, + "source": [ + "# 2. 
Negation" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7c9599ae-3162-4b09-9d69-183f806edb09", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''Most of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6e18db87-d15a-4395-8610-a08dfd00daf1", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"[^0-9]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "72d7d0b8-663d-4209-b358-d8d67d5467c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13012021\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text)) # Substitute the empty string for pattern in text" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "19ec2edd-f671-4d60-a5d6-df094c5a17fd", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"[^a-z]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f55e3f7c-e2ff-46e0-806b-8afd2bd6b8dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ost of the time we can use white space ut what about fred gmail com or he students attempts aren t working aybe it s the use of apostrophes r it might need a more up to date model \n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \" \", text)) # Substitute a space for pattern in text" + ] + }, + { + "cell_type": "markdown", + "id": "a5ce7b61-be11-4ff0-8a64-c4e191ef2f0e", + "metadata": {}, + "source": [ + "# 3. 
Optionality" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bd2bdf73-1cd7-40d1-8e21-a31cb54e4a6e", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''begin began begun beginning'''" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "629a5a13-d28c-4dda-8cc0-11ef642522f1", + "metadata": {}, + "outputs": [], + "source": [ + "# . means match anything, like a wildcard\n", + "pattern = r\"beg.n\" # 'beg' followed by anything and followed by n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ce857270-9737-48a1-b240-c577857ac1c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X X X Xning\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "97850c49-7503-4809-8de7-8d6674e850fd", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''colour can be spelled color'''" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "75c82145-1d37-48d5-ac09-9d41edbc8032", + "metadata": {}, + "outputs": [], + "source": [ + "# ? 
means previous character is optional\n", + "pattern = r\"colou?r\"" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e581547a-9702-42f4-8900-5ba378e81c06", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " can be spelled \n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "042afb72-6384-4b8c-a3f3-4ce5be65d4d0", + "metadata": {}, + "outputs": [], + "source": [ + "# * is the Kleene star, meaning match 0 or more of previous character\n", + "pattern = r\"w.*\"" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c2e8acc9-f99b-44ce-a192-fb0ea20c022b", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''Most of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4fafab29-1b2e-445c-ba27-298dbf6dd93c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time \n", + "But \n", + "The students' attempts aren't \n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "908f46ea-21f7-4aa2-a082-b4363aef8bdd", + "metadata": {}, + "outputs": [], + "source": [ + "# make sure the match is non-greedy using the ? 
character\n", + "pattern = r\"w.*?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "8c1fa3cf-65fb-4b2b-9e42-43c9e67d6d89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time Xe can use Xhite space.\n", + "But Xhat about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't Xorking.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "b1afabfa-2e93-4ae1-9456-b1caa4b285f3", + "metadata": {}, + "outputs": [], + "source": [ + "# make sure the match is non-greedy using the ? character, the whole word\n", + "pattern = r\"w.*? \"" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "305520e2-21ef-455e-850a-d560db508279", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time Xcan use Xspace.\n", + "But Xabout fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? 
Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "dc5cfc4f-890f-48d9-a9d2-a5417dc47d30", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''foo fooo foooo fooooo!'''" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "fc96a086-0e77-451a-bc70-05be854fdc70", + "metadata": {}, + "outputs": [], + "source": [ + "# + is the Kleene plus, meaning match 1 or more of the previous character\n", + "pattern = r\"fooo+\"" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "d3594bb1-4668-4b28-a765-83cd633cc369", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "foo X X X!\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text))" + ] + }, + { + "cell_type": "markdown", + "id": "29064c36-95c8-4c62-9fde-21a8b7e5ddb9", + "metadata": {}, + "source": [ + "# 4. Aliases\n", + "Shortcuts\n", + "```\n", + "\\w - match a word character (letter, digit or underscore)\n", + "\\d - match a digit\n", + "\\s - match a whitespace character\n", + "\\W - match a non-word character\n", + "\\D - match a non-digit character\n", + "\\S - match a non-whitespace character\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "73bb5e8a-2b90-477f-b7c6-8562f69abf8e", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''Most of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? 
Or it might need a more up-to-date model.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e92a698d-509e-499b-8dee-052d007c309b", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"\\w\"" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "b2df707c-c75f-4ec6-808e-969dca4e5b54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " .\n", + " @. //?\n", + " ' ' .\n", + " ' ? -- .\n", + "\n" + ] + } + ], + "source": [ + "# match all word characters\n", + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "d25c8164-2eb9-4fc0-acfc-c3a6d779a314", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"\\d\"" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "47a39565-955e-48de-bb4b-6d333a6c0597", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time we can use white space.\n", + "But what about fred@gmail.com or XX/XX/XXXX?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"X\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "c4919c02-5f03-499f-9b7d-cf6e3f3b5961", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = r\"\\D\"" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "49d9a61a-3b64-4aef-af32-fb31b8238418", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13012021\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "markdown", + "id": "94ceba75-eaf1-4eff-94df-83136b3a3479", + "metadata": {}, + "source": [ + "# 5. 
Anchors" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "4949428a-4c53-41f3-8158-4104c8f77da0", + "metadata": {}, + "outputs": [], + "source": [ + "# delete all words\n", + "pattern = \"\\w+\"" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "5402a2e6-315d-4c20-a47b-30ba86819193", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " .\n", + " @. //?\n", + " ' ' .\n", + " ' ? -- .\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "8eb6e9e7-72a9-46aa-a9d1-8467d98e5e4e", + "metadata": {}, + "outputs": [], + "source": [ + "# delete only words at the start of a string\n", + "pattern = \"^\\w+\"" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "0566be5e-c17b-4603-b877-f3036ea9bb80", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "764907c7-45ad-4934-886f-275696252a00", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " of the time we can use white space.\n", + " what about fred@gmail.com or 13/01/2021?\n", + " students' attempts aren't working.\n", + " it's the use of apostrophes? 
Or it might need a more up-to-date model.\n", + "\n" + ] + } + ], + "source": [ + "# switch on multiline mode to delete words at the start of each line\n", + "print(re.sub(pattern, \"\", text, flags=re.MULTILINE))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3bcacef3-7c01-4133-9028-100100b3a196", + "metadata": {}, + "outputs": [], + "source": [ + "# use $ to anchor the match at the end of a string\n", + "pattern = \"\\W$\"" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "b4662f11-aeb2-4734-b131-240987dd77a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time we can use white space.\n", + "But what about fred@gmail.com or 13/01/2021?\n", + "The students' attempts aren't working.\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model\n" + ] + } + ], + "source": [ + "print(re.sub(pattern, \"\", text))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "8afed181-1705-4933-a0d0-4ea5e10ca904", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most of the time we can use white space\n", + "But what about fred@gmail.com or 13/01/2021\n", + "The students' attempts aren't working\n", + "Maybe it's the use of apostrophes? Or it might need a more up-to-date model\n" + ] + } + ], + "source": [ + "# switch on multiline mode to delete non-words at the end of each line\n", + "print(re.sub(pattern, \"\", text, flags=re.MULTILINE))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}