Skip to content

Commit

Permalink
R1.0 prep (#1)
Browse files Browse the repository at this point in the history
* add logging for errors/warnings with ksqldb's provided log libs

* add two new udfs to convert emojis to aliases or codepoints

* externalize test samples into json files read using MethodSource

* refactor tests to use externalized test samples

* doc update/pre-format fix + version bump
  • Loading branch information
hpgrahsl authored Apr 23, 2020
1 parent 1481cd5 commit fff7e64
Show file tree
Hide file tree
Showing 23 changed files with 1,148 additions and 169 deletions.
77 changes: 70 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,36 @@ Variations :
specificEmojis: a list of specific emojis to remove
```

##### EMOJIS_TO_ALIASES

```
Overview : leverages the emoji-java library to replace emojis contained in a string by their textual aliases
Type : SCALAR
Variations :
Variation : EMOJIS_TO_ALIASES(text VARCHAR, fpAction VARCHAR)
Returns : VARCHAR
Description : replace emojis contained in a string by their textual aliases
text : the given text in which to replace any(!) emojis by their textual aliases
fpAction : how to deal with Fitzpatrick modifiers, must be either PARSE, REMOVE or IGNORE
```

##### EMOJIS_TO_HTMLCODEPOINTS

```
Version : 1.0.0
Overview : leverages the emoji-java library to replace emojis contained in a string by their HTML codepoints
Type : SCALAR
Variations :
Variation : EMOJIS_TO_HTMLCODEPOINTS(text VARCHAR, fpAction VARCHAR, encoding VARCHAR)
Returns : VARCHAR
Description : replace emojis contained in a string by their HTML codepoints
text : the given text in which to replace any(!) emojis by their HTML codepoints
fpAction : how to deal with Fitzpatrick modifiers, must be one of: PARSE, REMOVE, IGNORE
encoding : which HTML codepoints representation to use, must be one of: HEX, DEC
```

### Examples

The UDF call examples below are based on the following pre-defined sample content:
Expand Down Expand Up @@ -147,21 +177,54 @@ ksql> SELECT id,content,EMOJIS_REMOVE(content) AS result FROM examples EMIT CHAN
^CQuery terminated
```

##### EMOJIS_TO_ALIASES

```
ksql> SELECT id,content,EMOJIS_TO_ALIASES(content,'PARSE') AS result FROM examples EMIT CHANGES;
+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+
|ID |CONTENT |RESULT |
+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+
|1 |null |null |
|2 | | |
|3 |This is text without any emojis. |This is text without any emojis. |
|4 |🤓🤓This 🤩 is text🌻🌺🍄🍄with🎸🚀emojis🚀🚀.👏 |:nerd::nerd:This :star_struck: is text:sunflower::hibisc|
| | |us::mushroom::mushroom:with:guitar::rocket:emojis:rocket|
| | |::rocket:.:clap: |
^CQuery terminated
```

##### EMOJIS_TO_HTMLCODEPOINTS

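```
ksql> SELECT id,content,EMOJIS_TO_HTMLCODEPOINTS(content,'PARSE','HEX') AS result1,EMOJIS_TO_HTMLCODEPOINTS(content,'PARSE','DEC') AS result2 FROM examples EMIT CHANGES;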
+---------------------------------------------+-------------------------------------------------+---------------------------------------------+---------------------------------------------+
|ID |CONTENT |RESULT1 |RESULT2 |
+---------------------------------------------+-------------------------------------------------+---------------------------------------------+---------------------------------------------+
|1 |null |null |null |
|2 | | | |
|3 |This is text without any emojis. |This is text without any emojis. |This is text without any emojis. |
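|4 |🤓🤓This 🤩 is text🌻🌺🍄🍄with🎸🚀emojis🚀🚀 |&#x1f913;&#x1f913;This &#x1f929; is text&#x1f|&#129299;&#129299;This &#129321; is text&#127|
| |.👏 |33b;&#x1f33a;&#x1f344;&#x1f344;with&#x1f3b8;&|803;&#127802;&#127812;&#127812;with&#127928;&|
| | |#x1f680;emojis&#x1f680;&#x1f680;.&#x1f44f; |#128640;emojis&#128640;&#128640;.&#128079; |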
^CQuery terminated
```

### Installation / Deployment

1. You can either build the Maven project from sources or download the latest snapshot release as self-contained jar from [here](https://drive.google.com/file/d/167vjGKp99cQfppfWh5YrrBziI2Kiua0l/view?usp=sharing).
2. Move the `emoji-functions-1.0-SNAPSHOT.jar` file into a folder of your ksqlDB installation that is configured to load custom functions from during server bootstrap.
1. You can either build the Maven project from sources or download the latest release as self-contained jar from [here](https://drive.google.com/file/d/1NkXitI9fer6OmqVsFZnMPhDGHNo2tsu9/view?usp=sharing).
2. Move the `emoji-functions-1.0.jar` file into a folder of your ksqlDB installation that is configured to load custom functions from during server bootstrap.
3. (Re)Start your ksqlDB server instance(s) to make it pick up and load the emoji functions.
4. Verify if the deployment was successful by opening a ksqlDB CLI session and running `SHOW FUNCTIONS;` which should amongst all other available functions list the following **emoji-related UDFs**:

```
Function Name | Type
Function Name | Type
-----------------------------------
...
EMOJIS_CONTAINED | SCALAR
EMOJIS_COUNT | SCALAR
EMOJIS_EXTRACT | SCALAR
EMOJIS_REMOVE | SCALAR
EMOJIS_CONTAINED | SCALAR
EMOJIS_COUNT | SCALAR
EMOJIS_EXTRACT | SCALAR
EMOJIS_REMOVE | SCALAR
EMOJIS_TO_ALIASES | SCALAR
EMOJIS_TO_HTMLCODEPOINTS | SCALAR
...
-----------------------------------
```
Expand Down
18 changes: 17 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

<groupId>com.github.hpgrahsl.ksqldb.functions</groupId>
<artifactId>emoji-functions</artifactId>
<version>1.0-SNAPSHOT</version>
<version>1.0</version>
<packaging>jar</packaging>

<name>Utility functions for handling emojis within ksqlDB</name>
Expand All @@ -34,6 +34,7 @@
<emoji-java.version>5.1.1</emoji-java.version>
<junit.jupiter.version>5.6.1</junit.jupiter.version>
<maven.shade.version>3.2.1</maven.shade.version>
<javax.json.version>1.1.4</javax.json.version>
<!-- JUnit 5 requires Surefire version 2.22.1 or higher -->
<maven.surefire.version>2.22.1</maven.surefire.version>
</properties>
Expand Down Expand Up @@ -85,6 +86,15 @@
<goals>
<goal>shade</goal>
</goals>
<configuration>
<artifactSet>
<excludes>
<exclude>org.slf4j:slf4j-api:jar:</exclude>
<exclude>org.slf4j:slf4j-log4j12:jar:</exclude>
<exclude>log4j:log4j:jar:</exclude>
</excludes>
</artifactSet>
</configuration>
</execution>
</executions>
</plugin>
Expand Down Expand Up @@ -115,5 +125,11 @@
<version>${junit.jupiter.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.glassfish</groupId>
<artifactId>javax.json</artifactId>
<version>${javax.json.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
Expand All @@ -33,12 +35,18 @@
)
public class UdfEmojisContained {

private static final Logger LOGGER = LoggerFactory.getLogger(UdfEmojisContained.class);

@Udf(description = "checks whether or not the given string contains emojis")
public Boolean containsEmojis(
@UdfParameter(value = "text", description = "the given text in which to check for any(!) emoji occurrences")
final String text) {

return text == null ? null : EmojiManager.containsEmoji(text);
if(text == null) {
LOGGER.warn("the UDF parameter ('text') was null which is probably not intended");
return null;
}
return EmojiManager.containsEmoji(text);

}

Expand All @@ -49,8 +57,10 @@ public Boolean containsEmojis(
@UdfParameter(value = "specificEmojis", description = "a list of specific emojis to look for")
final List<String> specificEmojis) {

if(text == null)
if(text == null || specificEmojis == null) {
LOGGER.warn("any of the UDF parameters ('text','specificEmojis') was null which is probably not intended");
return null;
}

var containedEmojis = new HashSet<>(EmojiParser.extractEmojis(text));
containedEmojis.retainAll(new HashSet<>(specificEmojis));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;

Expand All @@ -31,15 +33,19 @@
)
public class UdfEmojisCount {

private static final Logger LOGGER = LoggerFactory.getLogger(UdfEmojisCount.class);

@Udf(description = "counts the number of potentially contained emojis with or without duplicates from the given string")
public Integer countEmojis(
@UdfParameter(value = "text", description = "the given text in which to count emojis")
final String text,
@UdfParameter(value = "unique", description = "if true will return count of unique emojis, if false counts all emojis i.e. also duplicates")
final boolean unique) {

if(text == null)
if(text == null) {
LOGGER.warn("the UDF parameter ('text') was null which is probably not intended");
return null;
}

return !unique
? EmojiParser.extractEmojis(text).size()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.LinkedHashSet;
Expand All @@ -33,15 +35,19 @@
)
public class UdfEmojisExtract {

private static final Logger LOGGER = LoggerFactory.getLogger(UdfEmojisExtract.class);

@Udf(description = "extracts a list of potentially contained emojis with or without duplicates from the given string")
public List<String> extractEmojis(
@UdfParameter(value = "text", description = "the given text to extract emojis from")
final String text,
@UdfParameter(value = "unique", description = "if true will return only unique emojis (set semantic), if false every emoji i.e. also duplicate ones (list semantic) will be returned")
final boolean unique) {

if(text == null)
if(text == null) {
LOGGER.warn("the UDF parameter ('text') was null which is probably not intended");
return null;
}

return !unique
? EmojiParser.extractEmojis(text)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.stream.Collectors;
Expand All @@ -33,12 +35,18 @@
)
public class UdfEmojisRemove {

private static final Logger LOGGER = LoggerFactory.getLogger(UdfEmojisRemove.class);

@Udf(description = "removes emojis contained in a string")
public String removeEmojis(
@UdfParameter(value = "text", description = "the given text from which to remove any(!) emojis")
final String text) {

return text == null ? null : EmojiParser.removeAllEmojis(text);
if(text == null) {
LOGGER.warn("the UDF parameter ('text') was null which is probably not intended");
return null;
}
return EmojiParser.removeAllEmojis(text);

}

Expand All @@ -49,13 +57,16 @@ public String removeEmojis(
@UdfParameter(value = "specificEmojis", description = "a list of specific emojis to remove")
final List<String> specificEmojis) {

return text == null
? null
: EmojiParser.removeEmojis(text,
if(text == null || specificEmojis == null) {
LOGGER.warn("any of the UDF parameters ('text','specificEmojis') was null which is probably not intended");
return null;
}

return EmojiParser.removeEmojis(text,
specificEmojis.stream()
.map(EmojiManager::getByUnicode)
.collect(Collectors.toList())
);
);

}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Copyright (c) 2020. Hans-Peter Grahsl ([email protected])
*
* Licensed under the MIT License (the "License").
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at: https://opensource.org/licenses/MIT
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
* OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/

package com.github.hpgrahsl.ksqldb.functions;

import com.vdurmont.emoji.EmojiParser;
import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Locale;

@UdfDescription(
    name = "emojis_to_aliases",
    description = "leverages the emoji-java library to replace emojis contained in a string by their textual aliases",
    author = "Hans-Peter Grahsl (follow @hpgrahsl)",
    version = "1.0.0"
)
public class UdfEmojisToAliases {

    private static final Logger LOGGER = LoggerFactory.getLogger(UdfEmojisToAliases.class);

    /**
     * Replaces the emojis contained in {@code text} by their textual aliases.
     *
     * <p>The {@code fpAction} value is upper-cased and resolved against the constants of
     * {@link EmojiParser.FitzpatrickAction}, so it is matched case-insensitively.
     *
     * @param text     the text in which emojis are replaced; may be {@code null}
     * @param fpAction name of the Fitzpatrick modifier handling (PARSE, REMOVE or IGNORE);
     *                 may be {@code null}
     * @return the text with emojis replaced by aliases, or {@code null} if either argument is
     *         {@code null} (a warning is logged) or if an {@link IllegalArgumentException} is
     *         raised, e.g. because {@code fpAction} names no known action (an error is logged)
     */
    @Udf(description = "replace emojis contained in a string by their textual aliases")
    public String replaceEmojisWithAliases(
            @UdfParameter(value = "text", description = "the given text in which to replace any(!) emojis by their textual aliases")
            final String text,
            @UdfParameter(value = "fpAction", description = "how to deal with Fitzpatrick modifiers, must be either PARSE, REMOVE or IGNORE")
            final String fpAction) {

        if (text == null || fpAction == null) {
            LOGGER.warn("any of the UDF parameters ('text','fpAction') was null which is probably not intended");
            return null;
        }

        try {
            // Locale.ROOT keeps the upper-casing independent of the JVM default locale;
            // e.g. under a Turkish locale "ignore" would otherwise become "İGNORE" and be rejected.
            final EmojiParser.FitzpatrickAction action =
                    EmojiParser.FitzpatrickAction.valueOf(fpAction.toUpperCase(Locale.ROOT));
            return EmojiParser.parseToAliases(text, action);
        } catch (IllegalArgumentException e) {
            // A trailing Throwable argument is logged by SLF4J together with its stack trace.
            LOGGER.error("the UDF parameter (fpAction '{}') is invalid", fpAction, e);
            return null;
        }

    }

}
Loading

0 comments on commit fff7e64

Please sign in to comment.