From c322a7600570522f3eba0092bcc3505ad8c57ab3 Mon Sep 17 00:00:00 2001 From: FerroEduardo <47820549+FerroEduardo@users.noreply.github.com> Date: Sun, 26 Feb 2023 13:07:00 -0300 Subject: [PATCH 1/4] Fixes normaliser and emoji dependency --- enelvo/preprocessing/preprocessing.py | 4 ++-- enelvo/preprocessing/tokenizer/tokenizer.py | 5 ++--- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/enelvo/preprocessing/preprocessing.py b/enelvo/preprocessing/preprocessing.py index 44ecee2..180d981 100644 --- a/enelvo/preprocessing/preprocessing.py +++ b/enelvo/preprocessing/preprocessing.py @@ -1,6 +1,6 @@ """Preprocessing methods.""" from .tokenizer import Tokenizer -from emoji import UNICODE_EMOJI +import emoji import string import os.path @@ -70,7 +70,7 @@ def sanitize(text, as_string=False): clean = [ w.strip() for w in tokens - if w not in emoticons and w not in UNICODE_EMOJI and len(w) != 0 + if w not in emoticons and emoji.emoji_count(w) == 0 and len(w) != 0 ] return clean if not as_string else " ".join(clean) diff --git a/enelvo/preprocessing/tokenizer/tokenizer.py b/enelvo/preprocessing/tokenizer/tokenizer.py index 0491461..fee9b22 100644 --- a/enelvo/preprocessing/tokenizer/tokenizer.py +++ b/enelvo/preprocessing/tokenizer/tokenizer.py @@ -9,9 +9,8 @@ # Adapted to Portuguese by Thales Bertaglia import re -import string from os import path -from emoji import UNICODE_EMOJI +import emoji from html.entities import name2codepoint @@ -72,7 +71,7 @@ def _isemoji(s): len(s) == len("\U0001f4a9") and any(l <= s <= u for l, u in emoji_ranges) or s in emoji_flags - or s in UNICODE_EMOJI + or emoji.emoji_count(s) > 0 ) diff --git a/pyproject.toml b/pyproject.toml index 8423f43..8d8a96b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ editdistance = ">=0.6.0" numpy = ">=1.19.5" gensim = ">=4.1.2" tabulate = ">=0.8.9" -emoji = ">=1.6.3" +emoji = "2.2.0" [tool.poetry.dev-dependencies] coveralls = ">=3.3.1" From 8f3428dbb3ff101a2b4fa6d6e2e7e62c290684ab Mon Sep 17 00:00:00 2001 From: FerroEduardo <47820549+FerroEduardo@users.noreply.github.com> Date: Sun, 26 Feb 2023 13:07:12 -0300 Subject: [PATCH 2/4] Create normaliser tests --- tests/test_normaliser.py | 58 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/test_normaliser.py diff --git a/tests/test_normaliser.py b/tests/test_normaliser.py new file mode 100644 index 0000000..3694374 --- /dev/null +++ b/tests/test_normaliser.py @@ -0,0 +1,58 @@ +from enelvo import normaliser + +def test_raw(): + norm = normaliser.Normaliser() + assert norm.normalise('Que dia lindo') == 'que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'hoje eu vou dar uma voltinha 😀' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'vou dar uma passada na casa do eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'domingo é dia de paredão no bbb' + assert norm.normalise('#python > #javascript') == 'hashtag > hashtag' + +def test_sanitize(): + norm = normaliser.Normaliser(sanitize=True) + assert norm.normalise('Que dia lindo') == 'que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'hoje eu vou dar uma voltinha' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'vou dar uma passada na casa do eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'domingo é dia de paredão no bbb' + assert norm.normalise('#python > #javascript') == 'hashtag hashtag' + +def test_capitalize_pns(): + norm = normaliser.Normaliser(capitalize_pns=True) + assert norm.normalise('Que dia lindo') == 'que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'hoje eu vou dar uma voltinha 😀' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'vou dar uma passada na casa do Eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'domingo é dia de paredão no bbb' + assert norm.normalise('#python > #javascript') == 'Hashtag > Hashtag' + +def test_capitalize_inis(): + norm = normaliser.Normaliser(capitalize_inis=True) + assert norm.normalise('Que dia lindo') == 'Que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'Hoje eu vou dar uma voltinha 😀' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'Vou dar uma passada na casa do eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'Domingo é dia de paredão no bbb' + assert norm.normalise('#python > #javascript') == 'Hashtag > hashtag' + +def test_capitalize_acs(): + norm = normaliser.Normaliser(capitalize_acs=True) + assert norm.normalise('Que dia lindo') == 'que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'hoje eu vou dar uma voltinha 😀' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'vou dar uma passada na casa do eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'domingo é dia de paredão no BBB' + assert norm.normalise('#python > #javascript') == 'hashtag > hashtag' + +def test_readable_tokenizer(): + norm = normaliser.Normaliser(tokenizer='readable') + assert norm.normalise('Que dia lindo') == 'que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'hoje eu vou dar uma voltinha 😀' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'vou dar uma passada na casa do eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'domingo é dia de paredão no bbb' + assert norm.normalise('#python > #javascript') == '#python > #javascript' + +def test_all(): + norm = normaliser.Normaliser(sanitize=True, capitalize_pns=True, capitalize_inis=True, capitalize_acs=True, tokenizer='readable') + assert norm.normalise('Que dia lindo') == 'Que dia lindo' + assert norm.normalise('Hoje eu vou dar uma voltinha 😀') == 'Hoje eu vou dar uma voltinha' + assert norm.normalise('Vou dar uma passada na casa do Eduardo') == 'Vou dar uma passada na casa do Eduardo' + assert norm.normalise('Domingo é dia de paredão no BBB') == 'Domingo é dia de paredão no BBB' + assert norm.normalise('#python > #javascript') == 'python javascript' + \ No newline at end of file From e72f3f601f84c83a1ab8bab12b2fb61dad741f22 Mon Sep 17 00:00:00 2001 From: FerroEduardo <47820549+FerroEduardo@users.noreply.github.com> Date: Sun, 26 Feb 2023 13:09:46 -0300 Subject: [PATCH 3/4] Remove unused imports --- enelvo/candidate_generation/embeddings.py | 2 -- enelvo/candidate_scoring/embeddings.py | 1 - enelvo/utils/evaluation.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/enelvo/candidate_generation/embeddings.py b/enelvo/candidate_generation/embeddings.py index 3724379..3d8c074 100644 --- a/enelvo/candidate_generation/embeddings.py +++ b/enelvo/candidate_generation/embeddings.py @@ -2,11 +2,9 @@ # Author: Thales Bertaglia -import gensim import pickle from enelvo import metrics from enelvo import candidate_scoring -from enelvo import utils from enelvo.candidate_generation import baselines diff --git a/enelvo/candidate_scoring/embeddings.py b/enelvo/candidate_scoring/embeddings.py index 7360e37..2d9331f 100644 --- a/enelvo/candidate_scoring/embeddings.py +++ b/enelvo/candidate_scoring/embeddings.py @@ -2,7 +2,6 @@ # Author: Thales Bertaglia -import pickle from enelvo import metrics diff --git a/enelvo/utils/evaluation.py b/enelvo/utils/evaluation.py index 103c256..57ed580 100644 --- a/enelvo/utils/evaluation.py +++ b/enelvo/utils/evaluation.py @@ -2,8 +2,6 @@ # Author: Thales Bertaglia -from tabulate import tabulate - def evaluate_candidate_generation(list_corrections, list_candidates): """Returns the recall (in %) of candidate generation methods. From 6147ce6eed05aeb68a7ae5bcf358c0b2e201a2af Mon Sep 17 00:00:00 2001 From: FerroEduardo <47820549+FerroEduardo@users.noreply.github.com> Date: Sun, 26 Feb 2023 13:19:58 -0300 Subject: [PATCH 4/4] Automatically run tests --- .github/workflows/tests.yaml | 29 +++++++++++++++++++++++++++++ .travis.yml | 2 ++ README.md | 1 + 3 files changed, 32 insertions(+) create mode 100644 .github/workflows/tests.yaml diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..17958c2 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,29 @@ +name: test + +on: + push: + branches: + - master + pull_request: + +jobs: + build: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install . + python -m pip install pytest pytest-cov + - name: Test with pytest + run: | + pytest tests/ --cov=enelvo/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 24ee624..963d390 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,8 @@ python: - "3.6" - "3.7" - "3.8" + - "3.9" + - "3.10" before_install: - pip install poetry diff --git a/README.md b/README.md index f7798ea..458329c 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@

A flexible normaliser for user-generated content in Portuguese.

+tests Build Status Coverage Status Code style: black