Skip to content

Commit

Permalink
Shard ZWJ sequence tables into separate files (#161)
Browse files Browse the repository at this point in the history
  • Loading branch information
ajalt authored Mar 12, 2024
1 parent b0eb592 commit 192f7e2
Show file tree
Hide file tree
Showing 8 changed files with 1,978 additions and 2,835 deletions.
2 changes: 1 addition & 1 deletion gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ coroutines-core = { module = "org.jetbrains.kotlinx:kotlinx-coroutines-core", ve
# used in tests
kotest = "io.kotest:kotest-assertions-core:5.8.0"
systemrules = "com.github.stefanbirkner:system-rules:1.19.0"
r8 = "com.android.tools:r8:8.1.72"
r8 = "com.android.tools:r8:8.3.37"
coroutines-test = { module = "org.jetbrains.kotlinx:kotlinx-coroutines-test", version.ref = "coroutines" }

# build logic
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Empty file.
39 changes: 24 additions & 15 deletions scripts/generate_emoji_sequence_table.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# Downloads the latest emoji sequence definitions from unicode.org and generates a Kotlin file
# that stores them as a trie. To avoid the generated file being too large, the sequences are split
# into shards. Each shard should be a separate file.

import re

import requests

emoji_zwj_url = "https://unicode.org/Public/emoji/latest/emoji-zwj-sequences.txt"
emoji_seq_url = "https://unicode.org/Public/emoji/latest/emoji-sequences.txt"
SHARD_SIZE = 500


def _parse_file(url: str, type_to_emit: str) -> list[tuple[list[int], str]]:
Expand Down Expand Up @@ -74,25 +79,29 @@ def main():
internal val EMOJI_SEQUENCES: IntTrie = buildSeqTrie()
private fun buildSeqTrie(): IntTrie {
val sequences = arrayOf("""
)
for s in seqs:
print(f' intArrayOf({", ".join(hex(it) for it in s[0])}), // {s[1]}')

print(
""" )
val root = IntTrie()
for (seq in sequences) {
var node = root
for (i in 0 until seq.lastIndex) {
node = node.children.getOrPut(seq[i]) { IntTrie() }
for (sequences in arrayOf(""", end="")
print(", ".join(f"sequences{i + 1}()" for i in range(0, len(seqs) // SHARD_SIZE + 1)), end="")
print(""")) {
for (seq in sequences) {
var node = root
for (i in 0..<seq.lastIndex) {
node = node.children.getOrPut(seq[i]) { IntTrie() }
}
node.values += seq.last()
}
node.values += seq.last()
}
return root
}"""
)
}
"""
)

for i, seq in enumerate(seqs):
if i % SHARD_SIZE == 0:
print(f"internal fun sequences{i // SHARD_SIZE + 1}(): Array<IntArray> = arrayOf(")
print(f" intArrayOf({', '.join(hex(it) for it in seq[0])}), // {seq[1]}")
if i % SHARD_SIZE == SHARD_SIZE - 1 or i == len(seqs) - 1:
print(")\n")


if __name__ == "__main__":
Expand Down

0 comments on commit 192f7e2

Please sign in to comment.