Skip to content

Commit

Permalink
Initial import of the neural, lexical and entropic measures.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 380089862
  • Loading branch information
agutkin authored and copybara-github committed Jun 18, 2021
1 parent 65d81c4 commit f164714
Show file tree
Hide file tree
Showing 28 changed files with 9,919 additions and 0 deletions.
2 changes: 2 additions & 0 deletions homophonous_logography/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This directory contains the supporting code for the respective paper currently
under review. Please check back for details.
59 changes: 59 additions & 0 deletions homophonous_logography/entropic/.bazelrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Bazel configuration file.
# -------------------------
# Based on TensorFlow options in:
# https://github.com/tensorflow/tensorflow/blob/master/.bazelrc
#
# Compiler options:
# c++17: Build with C++17 options (links with libc++)
# c++1z: Build with C++17 options (links with libc++)
# c++17_gcc: Build with C++17 options (links with libstdc++)
# c++1z_gcc: Build with C++17 options (links with libstdc++)
#
# Other build options:
# short_logs: Only log errors during build, skip warnings.
# verbose_logs: Show all compiler warnings during build.
# libc++: Link against libc++ instead of libstdc++

# Log verbosity presets. `--output_filter` takes a regular expression selecting
# the targets whose build output (warnings) is shown: `DONT_MATCH_ANYTHING`
# matches no target, so warnings are suppressed; an empty value shows them all.
build:short_logs --output_filter=DONT_MATCH_ANYTHING
build:verbose_logs --output_filter=
# Quiet logs are the default; pass `--config=verbose_logs` to see warnings.
build --config=short_logs

# Allow builds that use libc++ as the C++ standard library (enable with
# `--config=libc++`). This is mostly for OSS-Fuzz, so the compiler selection
# (CC, CXX) and PATH are forwarded from the invoking environment into the
# otherwise clean action environment.
build:libc++ --action_env=CC
build:libc++ --action_env=CXX
build:libc++ --action_env=CXXFLAGS=-stdlib=libc++
build:libc++ --action_env=PATH
# NOTE(review): `force_libcpp` is presumably consumed by a dependency's
# config_setting; nothing in this workspace's own BUILD file reads it.
build:libc++ --define force_libcpp=enabled
# Link with LLVM's lld.
build:libc++ --linkopt -fuse-ld=lld

# Build with C++ 17 features. `c++17` additionally selects libc++ as the
# standard library, whereas `c++17_gcc` leaves the toolchain's default standard
# library in place. The `c++1z*` configs are aliases for the matching `c++17*`
# configs.
build:c++17 --cxxopt=-std=c++1z
build:c++17 --cxxopt=-stdlib=libc++
build:c++1z --config=c++17
build:c++17_gcc --cxxopt=-std=c++1z
build:c++1z_gcc --config=c++17_gcc

# Enable using platform specific build settings, except when cross-compiling for
# mobile platforms. With this option Bazel automatically applies the config
# named after the host operating system (e.g. `linux`, `macos`, `windows`).
build --enable_platform_specific_config
build:android --noenable_platform_specific_config
build:ios --noenable_platform_specific_config

# By default, build in C++ 17 mode. The flag is set for both target (`cxxopt`)
# and host (`host_cxxopt`) compilations; Windows uses the MSVC spelling of the
# flag.
build:android --cxxopt=-std=c++17
build:android --host_cxxopt=-std=c++17
build:ios --cxxopt=-std=c++17
build:ios --host_cxxopt=-std=c++17
build:linux --cxxopt=-std=c++17
build:linux --host_cxxopt=-std=c++17
build:macos --cxxopt=-std=c++17
build:macos --host_cxxopt=-std=c++17
build:windows --cxxopt=/std:c++17
build:windows --host_cxxopt=/std:c++17

# Fix for thread_identity issues in absl:: on macOS.
# See: https://github.com/abseil/abseil-cpp/issues/848
build:macos --features=-supports_dynamic_linker
68 changes: 68 additions & 0 deletions homophonous_logography/entropic/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Bazel build file for entropic tools.

licenses(["notice"])

# Command-line tool run as the first step of compute_relative_entropy.sh to
# split the input corpus.
py_binary(
    name = "split_corpus_main",
    srcs = ["split_corpus_main.py"],
    # Same entry point Bazel would infer from the target name.
    main = "split_corpus_main.py",
    python_version = "PY3",
    srcs_version = "PY3",
    deps = [
        "@io_abseil_py//absl:app",
        "@io_abseil_py//absl/flags",
        "@io_abseil_py//absl/logging",
    ],
)

# Command-line tool that compute_relative_entropy.sh invokes with the written
# and pronounced perplexity reports (`--wperp`, `--pperp`).
py_binary(
    name = "entropy_difference_main",
    srcs = ["entropy_difference_main.py"],
    # Same entry point Bazel would infer from the target name.
    main = "entropy_difference_main.py",
    python_version = "PY3",
    srcs_version = "PY3",
    deps = [
        "@io_abseil_py//absl:app",
        "@io_abseil_py//absl/flags",
    ],
)

# Local aliases for the external OpenFst and OpenGrm N-Gram command-line tools,
# so that they can be built from this package as `:<tool>`.

# The FAR string compiler is the only tool that comes from OpenFst.
alias(
    name = "farcompilestrings",
    actual = "@org_openfst//:farcompilestrings",
)

# OpenGrm N-Gram tools, each exposed under its upstream target name.
[
    alias(
        name = tool,
        actual = "@org_opengrm_ngram//:" + tool,
    )
    for tool in [
        "ngramsymbols",
        "ngramcount",
        "ngrammake",
        "ngramperplexity",
        "ngramprint",
    ]
]

# C++ tool built from ngramcrossentropy-main.cc against Abseil and the OpenGrm
# N-Gram library. compute_relative_entropy.sh runs it as the final step on the
# trained joint/source/destination n-gram models and the test FARs.
cc_binary(
name = "ngramcrossentropy",
srcs = ["ngramcrossentropy-main.cc"],
deps = [
"@io_abseil_cpp//absl/memory",
"@io_abseil_cpp//absl/strings",
"@org_opengrm_ngram//:opengrm-ngram-lib",
],
)
4 changes: 4 additions & 0 deletions homophonous_logography/entropic/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Information-theoretic measures based on n-gram entropy. The code in this
directory relies on [OpenFst](http://www.openfst.org/) and
[OpenGrm N-Gram](http://www.opengrm.org/) libraries built using the
[Bazel](https://bazel.build/) build system.
66 changes: 66 additions & 0 deletions homophonous_logography/entropic/WORKSPACE.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Bazel (http://bazel.io/) workspace file for the entropic measures.

workspace(name = "com_google_entropic")

load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# -------------------------------------------------------------------------
# Six is a Python 2 and 3 compatibility library:
# -------------------------------------------------------------------------

# Six source distribution from PyPI, built with the local six.BUILD.bazel file.
http_archive(
    name = "six_archive",
    build_file = "@//bazel:six.BUILD.bazel",
    # The archive contents are pinned by this checksum.
    sha256 = "70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
    strip_prefix = "six-1.11.0",
    # The `#md5=...` fragment that used to trail this URL was malformed (it
    # contained a second `#`). A fragment is never sent to the server and the
    # download is already verified through `sha256`, so it is simply omitted.
    urls = ["https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe/six-1.11.0.tar.gz"],
)

# -------------------------------------------------------------------------
# Google Abseil - C++ and Python Common Libraries:
# -------------------------------------------------------------------------

# Abseil C++ fetched from the tip of the `master` branch.
# NOTE(review): no `sha256` is given and `master` moves, so this download is
# not reproducible; consider pinning to a commit archive with a checksum.
# NOTE(review): presumably kept under the `com_google_absl` name for external
# dependencies that expect it; this workspace's own BUILD file uses
# `@io_abseil_cpp` instead. Confirm both copies are needed.
http_archive(
name = "com_google_absl",
strip_prefix = "abseil-cpp-master",
urls = ["https://github.com/abseil/abseil-cpp/archive/master.zip"],
)

# Abseil C++ pinned to a fixed commit; this is the repository referenced by the
# `ngramcrossentropy` target.
git_repository(
name = "io_abseil_cpp",
commit = "078b89b3c046d230ef3ad39494e5852184eb528b", # 24th October, 2019.
remote = "https://github.com/abseil/abseil-cpp.git",
)

# Abseil Python (app, flags, logging) used by the py_binary targets.
# NOTE(review): also tracks `master` without a checksum; see the note above.
http_archive(
name = "io_abseil_py",
strip_prefix = "abseil-py-master",
urls = ["https://github.com/abseil/abseil-py/archive/master.zip"],
)

# -------------------------------------------------------------------------
# OpenFst: See
# http://www.openfst.org/twiki/pub/FST/FstDownload/README
# -------------------------------------------------------------------------
# OpenFst release to fetch; the version string appears both in the tarball name
# and in the top-level directory inside it.
openfst_version = "1.8.2-rc1"

http_archive(
    name = "org_openfst",
    sha256 = "0e86f73a7b4ebeadcb62af65479c352db9e0241a05317942767ec2670e58a6fb",
    strip_prefix = "openfst-{}".format(openfst_version),
    urls = ["https://github.com/agutkin/finite_state/raw/main/openfst-{}.tar.gz".format(openfst_version)],
)

# -------------------------------------------------------------------------
# OpenGrm N-Gram: See
# http://www.openfst.org/twiki/bin/view/GRM/NGramLibrary
# -------------------------------------------------------------------------
# OpenGrm N-Gram release to fetch; the version string appears both in the
# tarball name and in the top-level directory inside it.
opengrm_ngram_version = "1.3.13-rc1"

http_archive(
    name = "org_opengrm_ngram",
    sha256 = "c027cee208090f35a1f725dc9cc22bc0d977adba346d765bf2e1f55990a4fa40",
    strip_prefix = "ngram-{}".format(opengrm_ngram_version),
    urls = ["https://github.com/agutkin/finite_state/raw/main/ngram-{}.tar.gz".format(opengrm_ngram_version)],
)
5 changes: 5 additions & 0 deletions homophonous_logography/entropic/bazel/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Package holding BUILD files for third-party archives that do not ship their
# own Bazel build definitions.
package(default_visibility = ["//visibility:public"])

licenses(["notice"])

# Export every `*.BUILD.bazel` file in this directory so that repository rules
# in the WORKSPACE can refer to them through `build_file`.
exports_files(glob(["*.BUILD.bazel"]))
12 changes: 12 additions & 0 deletions homophonous_logography/entropic/bazel/six.BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Six is a Python 2 and 3 compatibility library.

# Exposes the single-module `six.py` from the extracted archive as a publicly
# visible library usable from both Python 2 and Python 3 code.
py_library(
name = "six",
srcs = ["six.py"],
srcs_version = "PY2AND3",
visibility = ["//visibility:public"],
)

# Local Variables:
# mode: python
# End:
148 changes: 148 additions & 0 deletions homophonous_logography/entropic/compute_relative_entropy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/bin/bash
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Obtain estimates of various information-theoretic measures based on entropy
# of n-grams. The first argument specifies the original training data in the
# format required for training the neural measures. This is a tab-separated text
# file. The second argument is the language name. The temporary files are
# created in the `/tmp` directory.
#
# Bazel build system should be installed locally for this tool to work.
#
# Usage:
# -----
#   ./compute_relative_entropy.sh <corpus.tsv> <language-name>
#
# Examples:
# --------
# ./compute_relative_entropy.sh ${DATA_DIR}/korean-jamo.tsv Korean
# ./compute_relative_entropy.sh ${DATA_DIR}/japanese.tsv Japanese

# Abort on the first failing command, on use of an unset variable and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Both positional arguments are mandatory. Report this with a usage message
# instead of the terse "unbound variable" error that `set -u` would raise.
if [ "$#" -lt 2 ] ; then
  echo "Usage: $0 <corpus.tsv> <language-name>" >&2
  exit 1
fi

# N-gram order.
ORDER=2

# Language name.
LANGUAGE="$2"

# Check for Bazel installation. The lookup has to happen inside the `if`
# condition: as a stand-alone command its failure would trigger `set -e` and
# terminate the script before the message below could be printed.
if ! command -v bazel > /dev/null 2>&1 ; then
  echo "Please install Bazel to run this tool!" >&2
  exit 1
fi

# Split the corpus into training/test written/pronounced/joint.
# The tool only needs the `--corpus` flag; the stray positional argument (a
# source path left over from another source tree) has been dropped. The later
# steps read the resulting /tmp/{w,p,j}train.txt and /tmp/{w,p,j}test.txt
# files, which are presumably written by this tool.
bazel build -c opt :split_corpus_main
bazel-bin/split_corpus_main --corpus="$1"

# Extract symbol tables. For each representation (written, phoneme, joint) the
# training and test portions are concatenated into a scratch file, from which
# the symbol table is derived.
bazel build -c opt :ngramsymbols
NGRAM_SYMBOLS_TOOL=bazel-bin/external/org_opengrm_ngram/ngramsymbols
for syms_spec in w:written p:phoneme j:joint ; do
  syms_prefix="${syms_spec%%:*}"   # File name prefix: w, p or j.
  syms_name="${syms_spec#*:}"      # Symbol table base name.
  cat "/tmp/${syms_prefix}train.txt" "/tmp/${syms_prefix}test.txt" > /tmp/xxx
  ${NGRAM_SYMBOLS_TOOL} /tmp/xxx "/tmp/${syms_name}.syms"
done

# Compile FST archives (FARs). Each text file is compiled against the symbol
# table of its representation; note that only the training portion of the
# joint data is compiled.
bazel build -c opt :farcompilestrings
FAR_COMPILE_STRINGS_TOOL=bazel-bin/external/org_openfst/farcompilestrings
for far_spec in \
    written:wtrain written:wtest \
    phoneme:ptrain phoneme:ptest \
    joint:jtrain ; do
  far_syms="${far_spec%%:*}"   # Symbol table base name.
  far_data="${far_spec#*:}"    # Data file base name.
  ${FAR_COMPILE_STRINGS_TOOL} \
    --fst_type=compact \
    --symbols="/tmp/${far_syms}.syms" \
    --keep_symbols \
    "/tmp/${far_data}.txt" "/tmp/${far_data}.far"
done

# Accumulate n-gram counts of the configured order from each training FAR.
bazel build -c opt :ngramcount
NGRAM_COUNT_TOOL=bazel-bin/external/org_opengrm_ngram/ngramcount
for count_set in wtrain ptrain jtrain ; do
  ${NGRAM_COUNT_TOOL} \
    --order="${ORDER}" \
    "/tmp/${count_set}.far" "/tmp/${count_set}.cnts"
done

# Build n-gram models from the accumulated counts.
bazel build -c opt :ngrammake
NGRAM_MAKE_TOOL=bazel-bin/external/org_opengrm_ngram/ngrammake
for model_set in wtrain ptrain jtrain ; do
  ${NGRAM_MAKE_TOOL} "/tmp/${model_set}.cnts" "/tmp/${model_set}.mod"
done

# Compute perplexities of the written (w) and pronounced (p) test sets under
# the models trained on the corresponding training sets.
bazel build -c opt :ngramperplexity
NGRAM_PERPLEXITY_TOOL=bazel-bin/external/org_opengrm_ngram/ngramperplexity
for perp_prefix in w p ; do
  ${NGRAM_PERPLEXITY_TOOL} \
    "/tmp/${perp_prefix}train.mod" \
    "/tmp/${perp_prefix}test.far" \
    "/tmp/${perp_prefix}test.perp"
done

# Compute entropy difference/ratio from the two perplexity reports.
# The corpus path is quoted so that paths containing spaces or glob characters
# reach the tool as a single, unmodified argument.
bazel build -c opt :entropy_difference_main
bazel-bin/entropy_difference_main \
  --corpus="$1" \
  --wperp=/tmp/wtest.perp \
  --pperp=/tmp/ptest.perp

# Print models: dump each trained model in textual form next to the binary one.
bazel build -c opt :ngramprint
NGRAM_PRINT_TOOL=bazel-bin/external/org_opengrm_ngram/ngramprint
for print_set in ptrain wtrain jtrain ; do
  ${NGRAM_PRINT_TOOL} "/tmp/${print_set}.mod" "/tmp/${print_set}.mod.txt"
done

# Compute (cross-)entropies/KL divergences and mutual information (MI) measures.
# The pronounced (p) data acts as the source and the written (w) data as the
# destination. The language name is expanded inside the quotes so that a name
# containing whitespace stays part of the single `--info_header` argument.
bazel build -c opt :ngramcrossentropy
bazel-bin/ngramcrossentropy \
  --info_header="[${LANGUAGE}]:" \
  --ngram_joint_fst=/tmp/jtrain.mod \
  --ngram_source_fst=/tmp/ptrain.mod \
  --ngram_destination_fst=/tmp/wtrain.mod \
  --source_samples_far=/tmp/ptest.far \
  --destination_samples_far=/tmp/wtest.far
Loading

0 comments on commit f164714

Please sign in to comment.