forked from google-research/google-research
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial import of the neural, lexical and entropic measures.
PiperOrigin-RevId: 380089862
- Loading branch information
1 parent
65d81c4
commit f164714
Showing
28 changed files
with
9,919 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
This directory contains the supporting code for the respective paper currently | ||
under review. Please check back for details. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# Bazel configuration file. | ||
# ------------------------- | ||
# Based on TensorFlow options in: | ||
# https://github.com/tensorflow/tensorflow/blob/master/.bazelrc | ||
# | ||
# Compiler options: | ||
# c++17: Build with C++17 options (links with libc++) | ||
# c++1z: Build with C++17 options (links with libc++) | ||
# c++17_gcc: Build with C++17 options (links with libstdc++) | ||
# c++1z_gcc: Build with C++17 options (links with libstdc++) | ||
# | ||
# Other build options: | ||
# short_logs: Only log errors during build, skip warnings. | ||
# verbose_logs: Show all compiler warnings during build. | ||
# libc++: Link against libc++ instead of libstdc++ | ||
|
||
# Suppress all warning messages. | ||
build:short_logs --output_filter=DONT_MATCH_ANYTHING | ||
build:verbose_logs --output_filter= | ||
build --config=short_logs | ||
|
||
# Allow builds using libc++ as a linker library. This is mostly for | ||
# OSSFuzz, so we also pass in the flags from environment to clean | ||
# build file. | ||
build:libc++ --action_env=CC | ||
build:libc++ --action_env=CXX | ||
build:libc++ --action_env=CXXFLAGS=-stdlib=libc++ | ||
build:libc++ --action_env=PATH | ||
build:libc++ --define force_libcpp=enabled | ||
build:libc++ --linkopt -fuse-ld=lld | ||
|
||
# Build with C++ 17 features. | ||
build:c++17 --cxxopt=-std=c++1z | ||
build:c++17 --cxxopt=-stdlib=libc++ | ||
build:c++1z --config=c++17 | ||
build:c++17_gcc --cxxopt=-std=c++1z | ||
build:c++1z_gcc --config=c++17_gcc | ||
|
||
# Enable using platform specific build settings, except when cross-compiling for | ||
# mobile platforms. | ||
build --enable_platform_specific_config | ||
build:android --noenable_platform_specific_config | ||
build:ios --noenable_platform_specific_config | ||
|
||
# By default, build in C++ 17 mode. | ||
build:android --cxxopt=-std=c++17 | ||
build:android --host_cxxopt=-std=c++17 | ||
build:ios --cxxopt=-std=c++17 | ||
build:ios --host_cxxopt=-std=c++17 | ||
build:linux --cxxopt=-std=c++17 | ||
build:linux --host_cxxopt=-std=c++17 | ||
build:macos --cxxopt=-std=c++17 | ||
build:macos --host_cxxopt=-std=c++17 | ||
build:windows --cxxopt=/std:c++17 | ||
build:windows --host_cxxopt=/std:c++17 | ||
|
||
# Fix for thread_identity issues in absl:: on macOS. | ||
# See: https://github.com/abseil/abseil-cpp/issues/848 | ||
build:macos --features=-supports_dynamic_linker |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Bazel build file for entropic tools. | ||
|
||
licenses(["notice"]) | ||
|
||
py_binary( | ||
name = "split_corpus_main", | ||
srcs = ["split_corpus_main.py"], | ||
python_version = "PY3", | ||
srcs_version = "PY3", | ||
deps = [ | ||
"@io_abseil_py//absl:app", | ||
"@io_abseil_py//absl/flags", | ||
"@io_abseil_py//absl/logging", | ||
], | ||
) | ||
|
||
py_binary( | ||
name = "entropy_difference_main", | ||
srcs = ["entropy_difference_main.py"], | ||
python_version = "PY3", | ||
srcs_version = "PY3", | ||
deps = [ | ||
"@io_abseil_py//absl:app", | ||
"@io_abseil_py//absl/flags", | ||
], | ||
) | ||
|
||
# Aliases for OpenFst and OpenGrm N-Gram tools. | ||
|
||
alias( | ||
name = "farcompilestrings", | ||
actual = "@org_openfst//:farcompilestrings", | ||
) | ||
|
||
alias( | ||
name = "ngramsymbols", | ||
actual = "@org_opengrm_ngram//:ngramsymbols", | ||
) | ||
|
||
alias( | ||
name = "ngramcount", | ||
actual = "@org_opengrm_ngram//:ngramcount", | ||
) | ||
|
||
alias( | ||
name = "ngrammake", | ||
actual = "@org_opengrm_ngram//:ngrammake", | ||
) | ||
|
||
alias( | ||
name = "ngramperplexity", | ||
actual = "@org_opengrm_ngram//:ngramperplexity", | ||
) | ||
|
||
alias( | ||
name = "ngramprint", | ||
actual = "@org_opengrm_ngram//:ngramprint", | ||
) | ||
|
||
cc_binary( | ||
name = "ngramcrossentropy", | ||
srcs = ["ngramcrossentropy-main.cc"], | ||
deps = [ | ||
"@io_abseil_cpp//absl/memory", | ||
"@io_abseil_cpp//absl/strings", | ||
"@org_opengrm_ngram//:opengrm-ngram-lib", | ||
], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Information-theoretic measures based on n-gram entropy. The code in this | ||
directory relies on [OpenFst](http://www.openfst.org/) and | ||
[OpenGrm N-Gram](http://www.opengrm.org/) libraries built using the | ||
[Bazel](https://bazel.build/) build system. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Bazel (http://bazel.io/) workspace file for the entropic measures. | ||
|
||
workspace(name = "com_google_entropic") | ||
|
||
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") | ||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") | ||
|
||
# ------------------------------------------------------------------------- | ||
# Six is a Python 2 and 3 compatibility library: | ||
# ------------------------------------------------------------------------- | ||
|
||
http_archive(
    name = "six_archive",
    build_file = "@//bazel:six.BUILD.bazel",
    sha256 = "70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
    strip_prefix = "six-1.11.0",
    # The download is verified through `sha256` above, so the URL carries no
    # checksum fragment (the previous `#md5=...` fragment was malformed: it
    # contained a second `#`).
    urls = ["https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe/six-1.11.0.tar.gz"],
)
|
||
# ------------------------------------------------------------------------- | ||
# Google Abseil - C++ and Python Common Libraries: | ||
# ------------------------------------------------------------------------- | ||
|
||
http_archive( | ||
name = "com_google_absl", | ||
strip_prefix = "abseil-cpp-master", | ||
urls = ["https://github.com/abseil/abseil-cpp/archive/master.zip"], | ||
) | ||
|
||
git_repository( | ||
name = "io_abseil_cpp", | ||
commit = "078b89b3c046d230ef3ad39494e5852184eb528b", # 24th October, 2019. | ||
remote = "https://github.com/abseil/abseil-cpp.git", | ||
) | ||
|
||
http_archive( | ||
name = "io_abseil_py", | ||
strip_prefix = "abseil-py-master", | ||
urls = ["https://github.com/abseil/abseil-py/archive/master.zip"], | ||
) | ||
|
||
# ------------------------------------------------------------------------- | ||
# OpenFst: See | ||
# http://www.openfst.org/twiki/pub/FST/FstDownload/README | ||
# ------------------------------------------------------------------------- | ||
openfst_version = "1.8.2-rc1" | ||
|
||
http_archive( | ||
name = "org_openfst", | ||
urls = ["https://github.com/agutkin/finite_state/raw/main/openfst-%s.tar.gz" % openfst_version], | ||
sha256 = "0e86f73a7b4ebeadcb62af65479c352db9e0241a05317942767ec2670e58a6fb", | ||
strip_prefix = "openfst-%s" % openfst_version, | ||
) | ||
|
||
# ------------------------------------------------------------------------- | ||
# OpenGrm N-Gram: See | ||
# http://www.openfst.org/twiki/bin/view/GRM/NGramLibrary | ||
# ------------------------------------------------------------------------- | ||
opengrm_ngram_version = "1.3.13-rc1" | ||
|
||
http_archive( | ||
name = "org_opengrm_ngram", | ||
urls = ["https://github.com/agutkin/finite_state/raw/main/ngram-%s.tar.gz" % opengrm_ngram_version], | ||
sha256 = "c027cee208090f35a1f725dc9cc22bc0d977adba346d765bf2e1f55990a4fa40", | ||
strip_prefix = "ngram-%s" % opengrm_ngram_version, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
package(default_visibility = ["//visibility:public"]) | ||
|
||
licenses(["notice"]) | ||
|
||
exports_files(glob(["*.BUILD.bazel"])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Six is a Python 2 and 3 compatibility library. | ||
|
||
py_library( | ||
name = "six", | ||
srcs = ["six.py"], | ||
srcs_version = "PY2AND3", | ||
visibility = ["//visibility:public"], | ||
) | ||
|
||
# Local Variables: | ||
# mode: python | ||
# End: |
148 changes: 148 additions & 0 deletions
148
homophonous_logography/entropic/compute_relative_entropy.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
#!/bin/bash
#
# NOTE: The interpreter line above must remain the very first line of the file;
# it is only honoured when `#!` are the first two bytes of the script.
#
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
||
# Obtain estimates of various information-theoretic measures based on entropy | ||
# of n-grams. The first argument specifies the original training data in the | ||
# format required for training the neural measures. This is a tab-separated text | ||
# file. The second argument is the language name. The temporary files are | ||
# created in the `/tmp` directory. | ||
# | ||
# Bazel build system should be installed locally for this tool to work. | ||
# | ||
# Examples: | ||
# -------- | ||
# ./compute_relative_entropy.sh ${DATA_DIR}/korean-jamo.tsv Korean | ||
# ./compute_relative_entropy.sh ${DATA_DIR}/japanese.tsv Japanese | ||
|
||
set -euo pipefail | ||
|
||
# N-gram order. | ||
ORDER=2 | ||
|
||
# Both positional arguments (corpus path and language name) are required.
# Check them explicitly: under `set -u` a missing argument would otherwise
# abort the script with a terse "unbound variable" error.
if [ "$#" -lt 2 ] ; then
  echo "Usage: $0 <corpus.tsv> <language-name>" >&2
  exit 1
fi

# Language name.
LANGUAGE="$2"
|
||
# Check for Bazel installation. The lookup is performed inside the `if`
# condition because the script runs under `set -e`: a failing stand-alone
# command would terminate the script before the hint below could be printed.
if ! command -v bazel > /dev/null 2>&1 ; then
  echo "Please install Bazel to run this tool!" >&2
  exit 1
fi
|
||
# Split the corpus into training/test written/pronounced/joint.
# The tool is configured through the `--corpus` flag only; no positional
# arguments are passed to it.
bazel build -c opt :split_corpus_main
bazel-bin/split_corpus_main --corpus="$1"
|
||
# Extract symbol tables. For each stream the training and test texts are
# concatenated into the scratch file /tmp/xxx, from which the symbols are read.
bazel build -c opt :ngramsymbols
NGRAM_SYMBOLS_TOOL=bazel-bin/external/org_opengrm_ngram/ngramsymbols
for stream in w:written p:phoneme j:joint ; do
  prefix="${stream%%:*}"
  table="${stream##*:}"
  cat "/tmp/${prefix}train.txt" "/tmp/${prefix}test.txt" > /tmp/xxx
  ${NGRAM_SYMBOLS_TOOL} /tmp/xxx "/tmp/${table}.syms"
done
|
||
# Compile FST archives (FARs). Each entry pairs a symbol table with the text
# file compiled against it; note that no FAR is built for the joint test set.
bazel build -c opt :farcompilestrings
FAR_COMPILE_STRINGS_TOOL=bazel-bin/external/org_openfst/farcompilestrings
for entry in \
    written:wtrain written:wtest \
    phoneme:ptrain phoneme:ptest \
    joint:jtrain ; do
  table="${entry%%:*}"
  text="${entry##*:}"
  ${FAR_COMPILE_STRINGS_TOOL} \
    --fst_type=compact \
    --symbols="/tmp/${table}.syms" \
    --keep_symbols \
    "/tmp/${text}.txt" "/tmp/${text}.far"
done
|
||
# Accumulate n-gram counts of order ${ORDER} for the written, pronounced and
# joint training archives.
bazel build -c opt :ngramcount
NGRAM_COUNT_TOOL=bazel-bin/external/org_opengrm_ngram/ngramcount
for train_set in wtrain ptrain jtrain ; do
  ${NGRAM_COUNT_TOOL} \
    --order="${ORDER}" \
    "/tmp/${train_set}.far" "/tmp/${train_set}.cnts"
done
|
||
# Build n-gram models from the accumulated counts.
bazel build -c opt :ngrammake
NGRAM_MAKE_TOOL=bazel-bin/external/org_opengrm_ngram/ngrammake
for train_set in wtrain ptrain jtrain ; do
  ${NGRAM_MAKE_TOOL} "/tmp/${train_set}.cnts" "/tmp/${train_set}.mod"
done
|
||
# Compute perplexities of the written (w) and pronounced (p) test archives
# under the corresponding training models.
bazel build -c opt :ngramperplexity
NGRAM_PERPLEXITY_TOOL=bazel-bin/external/org_opengrm_ngram/ngramperplexity
for side in w p ; do
  ${NGRAM_PERPLEXITY_TOOL} \
    "/tmp/${side}train.mod" "/tmp/${side}test.far" "/tmp/${side}test.perp"
done
|
||
# Compute entropy difference/ratio from the two perplexity reports.
bazel build -c opt :entropy_difference_main
# The corpus path is quoted so that a path containing whitespace or glob
# characters reaches the tool as a single, unmodified argument.
bazel-bin/entropy_difference_main \
  --corpus="$1" \
  --wperp=/tmp/wtest.perp \
  --pperp=/tmp/ptest.perp
|
||
# Print models: dump each trained model to a text file next to it.
bazel build -c opt :ngramprint
NGRAM_PRINT_TOOL=bazel-bin/external/org_opengrm_ngram/ngramprint
for model in ptrain wtrain jtrain ; do
  ${NGRAM_PRINT_TOOL} "/tmp/${model}.mod" "/tmp/${model}.mod.txt"
done
|
||
# Compute (cross-)entropies/KL divergences and mutual information (MI) measures.
bazel build -c opt :ngramcrossentropy
# The header is written as one quoted word, "[<language>]:", so that a language
# name containing whitespace or glob characters is passed through intact.
bazel-bin/ngramcrossentropy \
  --info_header="[${LANGUAGE}]:" \
  --ngram_joint_fst=/tmp/jtrain.mod \
  --ngram_source_fst=/tmp/ptrain.mod \
  --ngram_destination_fst=/tmp/wtrain.mod \
  --source_samples_far=/tmp/ptest.far \
  --destination_samples_far=/tmp/wtest.far
Oops, something went wrong.