From 3426cb57c642c17f01129753ba45adca062ad5af Mon Sep 17 00:00:00 2001 From: Sualeh Fatehi Date: Mon, 6 May 2024 20:38:33 -0400 Subject: [PATCH] Created using Colab --- ..._javascript_unicode_pattern_matching.ipynb | 365 ++++++++++-------- 1 file changed, 197 insertions(+), 168 deletions(-) diff --git a/Notebooks/5_javascript_unicode_pattern_matching.ipynb b/Notebooks/5_javascript_unicode_pattern_matching.ipynb index 84e20e6..b06be29 100644 --- a/Notebooks/5_javascript_unicode_pattern_matching.ipynb +++ b/Notebooks/5_javascript_unicode_pattern_matching.ipynb @@ -1,170 +1,199 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Unicode Pattern Matching" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PHrPaZuXetrR" + }, + "source": [ + "# Unicode Pattern Matching" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l35sZ-DdetrS" + }, + "source": [ + "## Case Insensitive Matching" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bpKPj0yetrT" + }, + "source": [ + "In Greek, the word for dog in lowercase is \"σκύλος\". Notice that the first and last letters are both sigma." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b0D8XmmUetrT" + }, + "outputs": [], + "source": [ + "%%script node\n", + "\n", + "const lower_greek = /σκύλος/iu\n", + "const upper_greek = \"ΣΚΎΛΟΣ\";\n", + "matches = lower_greek.test(upper_greek);\n", + "\n", + "console.log(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e4kztbf8etrT" + }, + "source": [ + "When a lowercase character results in more than one uppercase character (such as German ß, which uppercases to SS), there is no match."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IBI6YatUetrT" + }, + "outputs": [], + "source": [ + "%%script node\n", + "\n", + " const lower_german = /straße/iu // simple case folding: ß does not fold to SS\n", + " const upper_german = \"STRASSE\";\n", + " matches = lower_german.test(upper_german);\n", + "\n", + " console.log(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N8U7JfYyetrT" + }, + "source": [ + "## Matching Numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kEyK5CxBetrU" + }, + "source": [ + "A naive match with a range of digits `[0-9]` does not work." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3hm1wMm7etrU" + }, + "outputs": [], + "source": [ + "%%script node\n", + "\n", + "hindiNumber = \"१२३४५६७८९०\";\n", + "\n", + "digit = /[0-9]+/\n", + "matches = digit.test(hindiNumber);\n", + "\n", + "console.log(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y9AJrxKFetrU" + }, + "source": [ + "A slightly better regular expression with a `\\d` pattern does not work either." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dR9XOTsTetrU" + }, + "outputs": [], + "source": [ + "%%script node\n", + "\n", + "hindiNumber = \"१२३४५६७८९०\";\n", + "\n", + "standard_digit = /\\d+/\n", + "matches = standard_digit.test(hindiNumber);\n", + "\n", + "console.log(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bFFCgcWKetrU" + }, + "source": [ + "The best way to match digits is by matching against the Unicode Decimal Number Category (Nd), using a Unicode Category pattern `\\p{Nd}`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EHmWCZjyetrU" + }, + "outputs": [], + "source": [ + "%%script node\n", + "\n", + "hindiNumber = \"१२३४५६७८९०\";\n", + "\n", + "unicode_digit = /\\p{Nd}+/u\n", + "matches = unicode_digit.test(hindiNumber);\n", + "\n", + "console.log(matches);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "colab": { + "provenance": [], + "include_colab_link": true + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script node\n", - "\n", - "matches = false;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Case Insensitive Matching" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In Greek, the word for dog in lowercase is \"σκύλος\". Notice that the first and last letter are both sigma." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script node\n", - "\n", - "const lower_greek = /σκύλος/iu\n", - "const upper_greek = \"ΣΚΎΛΟΣ\";\n", - "matches = lower_greek.test(upper_greek);\n", - "\n", - "console.log(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When a lower -case character results in more than one uppercase character, there is no match." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script node\n", - "\n", - " const lower_german = /\"straße\"/iu\n", - " const upper_german = \"STRASSE\";\n", - " matches = lower_german.test(upper_german);\n", - "\n", - " console.log(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Matching Numbers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A naive match with a range of digits `[0-9]` does not work." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script node\n", - "\n", - "hindiNumber = \"१२३४५६७८९०\";\n", - "\n", - "digit = /[0-9]+/\n", - "matches = digit.test(hindiNumber);\n", - "\n", - "console.log(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A slightly better regular expression with a `\\d` pattern does not work either." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script node\n", - "\n", - "hindiNumber = \"१२३४५६७८९०\";\n", - "\n", - "standard_digit = /\\d+/\n", - "matches = standard_digit.test(hindiNumber);\n", - "\n", - "console.log(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The best way to match digits is by matching against the Unicode Decimal Number Category (Nd), using a Unicode Category pattern `\\p{Nd}`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script node\n", - "\n", - "hindiNumber = \"१२३४५६७८९०\";\n", - "\n", - "unicode_digit = /\\p{Nd}+/u\n", - "matches = unicode_digit.test(hindiNumber);\n", - "\n", - "console.log(matches);" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file