-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #138 from FerroEduardo/enhancements
Fixes emoji module, create normaliser tests and add automatic test
- Loading branch information
Showing
10 changed files
with
95 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Continuous-integration workflow: installs the package and runs the pytest
# suite with coverage on every Python version in the matrix.
name: test

# Run on pushes to master and on every pull request.
on:
  push:
    branches: [master]
  pull_request:

jobs:
  build:
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        # Versions are quoted so that "3.10" is not parsed as the number 3.1.
        python-version:
          - "3.6"
          - "3.7"
          - "3.8"
          - "3.9"
          - "3.10"

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .
          python -m pip install pytest pytest-cov
      - name: Test with pytest
        run: |
          pytest tests/ --cov=enelvo/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,8 @@ python: | |
- "3.6" | ||
- "3.7" | ||
- "3.8" | ||
- "3.9" | ||
- "3.10" | ||
|
||
before_install: | ||
- pip install poetry | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,11 +2,9 @@ | |
|
||
# Author: Thales Bertaglia <[email protected]> | ||
|
||
import gensim | ||
import pickle | ||
from enelvo import metrics | ||
from enelvo import candidate_scoring | ||
from enelvo import utils | ||
from enelvo.candidate_generation import baselines | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,6 @@ | |
|
||
# Author: Thales Bertaglia <[email protected]> | ||
|
||
import pickle | ||
from enelvo import metrics | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,9 +9,8 @@ | |
# Adapted to Portuguese by Thales Bertaglia <[email protected]> | ||
|
||
import re | ||
import string | ||
from os import path | ||
from emoji import UNICODE_EMOJI | ||
import emoji | ||
|
||
from html.entities import name2codepoint | ||
|
||
|
@@ -72,7 +71,7 @@ def _isemoji(s): | |
len(s) == len("\U0001f4a9") | ||
and any(l <= s <= u for l, u in emoji_ranges) | ||
or s in emoji_flags | ||
or s in UNICODE_EMOJI | ||
or emoji.emoji_count(s) > 0 | ||
) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,6 @@ | |
|
||
# Author: Thales Bertaglia <[email protected]> | ||
|
||
from tabulate import tabulate | ||
|
||
|
||
def evaluate_candidate_generation(list_corrections, list_candidates): | ||
"""Returns the recall (in %) of candidate generation methods. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from enelvo import normaliser | ||
|
||
def test_raw():
    """Check a Normaliser built with default arguments.

    Every expected output is lower-cased, the emoji is kept, and each
    hashtag is expected to come back as the word ``hashtag``.
    """
    norm = normaliser.Normaliser()
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'hashtag > hashtag'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|
||
def test_sanitize():
    """Check a Normaliser built with ``sanitize=True``.

    Compared with the default expectations, the expected outputs here no
    longer contain the emoji or the ``>`` symbol.
    """
    norm = normaliser.Normaliser(sanitize=True)
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'hashtag hashtag'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|
||
def test_capitalize_pns():
    """Check a Normaliser built with ``capitalize_pns=True``.

    The expected outputs keep ``Eduardo`` capitalised and capitalise the
    ``Hashtag`` replacement tokens; everything else stays lower-case.
    """
    norm = normaliser.Normaliser(capitalize_pns=True)
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do Eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'Hashtag > Hashtag'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|
||
def test_capitalize_inis():
    """Check a Normaliser built with ``capitalize_inis=True``.

    Each expected output starts with an upper-case letter while the rest of
    the sentence stays lower-case.
    """
    norm = normaliser.Normaliser(capitalize_inis=True)
    cases = [
        ('Que dia lindo', 'Que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'Hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'Vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'Domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'Hashtag > hashtag'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|
||
def test_capitalize_acs():
    """Check a Normaliser built with ``capitalize_acs=True``.

    The only difference from the default expectations is that ``BBB`` is
    expected to stay upper-case.
    """
    norm = normaliser.Normaliser(capitalize_acs=True)
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no BBB'),
        ('#python > #javascript', 'hashtag > hashtag'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|
||
def test_readable_tokenizer():
    """Check a Normaliser built with ``tokenizer='readable'``.

    The expected outputs are lower-cased, but the hashtags are expected to
    come back unchanged instead of being replaced.
    """
    norm = normaliser.Normaliser(tokenizer='readable')
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', '#python > #javascript'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|
||
def test_all():
    """Check a Normaliser with every option used by the other tests enabled.

    Combines ``sanitize``, the three ``capitalize_*`` flags and the
    ``'readable'`` tokenizer, and compares against the combined expectations.
    """
    norm = normaliser.Normaliser(
        sanitize=True,
        capitalize_pns=True,
        capitalize_inis=True,
        capitalize_acs=True,
        tokenizer='readable',
    )
    cases = [
        ('Que dia lindo', 'Que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'Hoje eu vou dar uma voltinha'),
        ('Vou dar uma passada na casa do Eduardo', 'Vou dar uma passada na casa do Eduardo'),
        ('Domingo é dia de paredão no BBB', 'Domingo é dia de paredão no BBB'),
        ('#python > #javascript', 'python javascript'),
    ]
    for text, expected in cases:
        assert norm.normalise(text) == expected
|