Skip to content

Commit

Permalink
New combined data struct for IANA name mapping (#4718)
Browse files Browse the repository at this point in the history
Split from #4548
  • Loading branch information
sffc committed Mar 23, 2024
1 parent 580c55a commit db791b5
Show file tree
Hide file tree
Showing 9 changed files with 1,193 additions and 8 deletions.
51 changes: 48 additions & 3 deletions components/timezone/src/provider/names.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
use crate::TimeZoneBcp47Id;
use core::str;
use icu_provider::prelude::*;
use zerotrie::ZeroTrie;
use zerotrie::{ZeroAsciiIgnoreCaseTrie, ZeroTrie};
use zerovec::{VarZeroVec, ZeroVec};

/// A mapping from IANA time zone identifiers to BCP-47 time zone identifiers.
/// A mapping from lowercase IANA time zone identifiers to BCP-47 time zone identifiers.
///
/// Multiple IANA time zone IDs can map to the same BCP-47 time zone ID.
///
Expand All @@ -42,11 +42,56 @@ use zerovec::{VarZeroVec, ZeroVec};
#[yoke(prove_covariance_manually)]
pub struct IanaToBcp47MapV1<'data> {
/// A map from IANA time zone identifiers to indexes of BCP-47 time zone identifiers.
/// The IANA identifiers are lowercase.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroTrie<ZeroVec<'data, u8>>,
/// A sorted list of BCP-47 time zone identifiers.
#[cfg_attr(feature = "serde", serde(borrow))]
// Note: this is 9739B as ZeroVec<TinyStr8> and 9335B as VarZeroVec<str>
// Note: this is 9739B as `ZeroVec<TimeZoneBcp47Id>` (`ZeroVec<TinyStr8>`)
// and 9335B as `VarZeroVec<str>`
pub bcp47_ids: ZeroVec<'data, TimeZoneBcp47Id>,
/// An XxHash64 checksum of [`Self::bcp47_ids`].
pub bcp47_ids_checksum: u64,
}

/// A mapping from normal-case IANA time zone identifiers to BCP-47 time zone identifiers.
///
/// Multiple IANA time zone IDs can map to the same BCP-47 time zone ID.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq)]
#[icu_provider::data_struct(marker(
IanaToBcp47MapV2Marker,
"time_zone/iana_to_bcp47@2",
singleton
))]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_timezone::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct IanaToBcp47MapV2<'data> {
/// A map from normal-case IANA time zone identifiers to indexes of BCP-47 time zone
/// identifiers along with a canonical flag. The IANA identifiers are normal-case.
///
/// The `usize` values stored in the trie have the following form:
///
/// - Lowest bit: 1 if canonical, 0 if not canonical
/// - All remaining bits: index into `bcp47_ids`
///
/// For example, in CLDR 44, `"Africa/Abidjan"` has value 221, which means it is canonical
/// (low bit is 1 == odd number) and the index into `bcp47_ids` is 110 (221 >> 1).
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroAsciiIgnoreCaseTrie<ZeroVec<'data, u8>>,
/// A sorted list of BCP-47 time zone identifiers.
#[cfg_attr(feature = "serde", serde(borrow))]
// Note: this is 9739B as `ZeroVec<TimeZoneBcp47Id>` (`ZeroVec<TinyStr8>`)
// and 9335B as `VarZeroVec<str>`
pub bcp47_ids: ZeroVec<'data, TimeZoneBcp47Id>,
/// An XxHash64 checksum of [`Self::bcp47_ids`].
pub bcp47_ids_checksum: u64,
Expand Down
1 change: 1 addition & 0 deletions provider/datagen/src/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ registry!(
icu_timezone::provider::MetazonePeriodV1Marker = "time_zone/metazone_period@1",
icu_timezone::provider::names::Bcp47ToIanaMapV1Marker = "time_zone/bcp47_to_iana@1",
icu_timezone::provider::names::IanaToBcp47MapV1Marker = "time_zone/iana_to_bcp47@1",
icu_timezone::provider::names::IanaToBcp47MapV2Marker = "time_zone/iana_to_bcp47@2",
#[cfg(feature = "experimental_components")]
icu_experimental::transliterate::provider::TransliteratorRulesV1Marker =
"transliterator/rules@1",
Expand Down
60 changes: 55 additions & 5 deletions provider/datagen/src/transform/cldr/time_zones/names.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use icu_timezone::TimeZoneBcp47Id;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::hash::Hasher;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::{ZeroAsciiIgnoreCaseTrie, ZeroTriePerfectHash};
use zerovec::{ZeroSlice, ZeroVec};

impl DataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
Expand All @@ -38,10 +38,6 @@ impl DataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
})
.collect();

// TODO(#4031): Use this ignore-case trie in the data. For now, check that it builds.
zerotrie::ZeroAsciiIgnoreCaseTrie::try_from(&map)
.expect("Data should work in case-insensitive trie");

let data_struct = IanaToBcp47MapV1 {
map: ZeroTriePerfectHash::try_from(&map)
.map_err(|e| {
Expand All @@ -66,6 +62,60 @@ impl IterableDataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
}
}

impl DataProvider<IanaToBcp47MapV2Marker> for crate::DatagenProvider {
    /// Builds the `IanaToBcp47MapV2` data struct from CLDR's `timezone.json`.
    ///
    /// Every IANA identifier is stored in a case-insensitive trie. The value stored
    /// for each identifier packs two pieces of information into one `usize`:
    ///
    /// - lowest bit: 1 if the IANA identifier is the canonical one for its BCP-47 ID
    /// - remaining bits: index of the BCP-47 ID in the sorted `bcp47_ids` list
    ///
    /// # Errors
    ///
    /// Returns a [`DataError`] if the CLDR resource cannot be read or parsed, or if
    /// the trie cannot be constructed from the computed map.
    fn load(&self, _: DataRequest) -> Result<DataResponse<IanaToBcp47MapV2Marker>, DataError> {
        let resource: &cldr_serde::time_zones::bcp47_tzid::Resource =
            self.cldr()?.bcp47().read_and_parse("timezone.json")?;
        let values = &resource.keyword.u.time_zones.values;

        // IANA ID -> BCP-47 ID, for every known IANA identifier.
        let iana_to_bcp47 = compute_bcp47_tzids_btreemap(values);

        // Routing the IDs through a BTreeSet sorts and deduplicates them before
        // they are stored in the ZeroVec.
        let bcp47_ids: ZeroVec<TimeZoneBcp47Id> = iana_to_bcp47
            .values()
            .copied()
            .collect::<BTreeSet<TimeZoneBcp47Id>>()
            .into_iter()
            .collect();
        let bcp47_ids_checksum = compute_bcp47_ids_hash(&bcp47_ids);

        // BCP-47 ID -> canonical IANA ID.
        // Note: the BTreeMap retains the order of the aliases, which is important for
        // establishing the canonical order of the IANA names.
        let canonical_iana = compute_canonical_tzids_btreemap(values);

        // Re-key the map by raw bytes and replace each BCP-47 ID by its packed index.
        #[allow(clippy::unwrap_used)] // structures are derived from each other
        let packed_map: BTreeMap<Vec<u8>, usize> = iana_to_bcp47
            .iter()
            .map(|(iana_id, bcp47_id)| {
                let position = bcp47_ids.binary_search(bcp47_id).unwrap();
                let canonical_bit = usize::from(canonical_iana.get(bcp47_id) == Some(iana_id));
                let key = iana_id.as_bytes().to_vec();
                (key, (position << 1) | canonical_bit)
            })
            .collect();

        let map = ZeroAsciiIgnoreCaseTrie::try_from(&packed_map)
            .map_err(|e| {
                DataError::custom("Could not create ZeroTrie from timezone.json data")
                    .with_display_context(&e)
            })?
            .convert_store();

        let payload = DataPayload::from_owned(IanaToBcp47MapV2 {
            map,
            bcp47_ids,
            bcp47_ids_checksum,
        });

        Ok(DataResponse {
            metadata: Default::default(),
            payload: Some(payload),
        })
    }
}

impl IterableDataProvider<IanaToBcp47MapV2Marker> for crate::DatagenProvider {
fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
Ok(vec![Default::default()])
}
}

impl DataProvider<Bcp47ToIanaMapV1Marker> for crate::DatagenProvider {
fn load(&self, _: DataRequest) -> Result<DataResponse<Bcp47ToIanaMapV1Marker>, DataError> {
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource =
Expand Down
5 changes: 5 additions & 0 deletions provider/datagen/tests/data/baked/macros.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions provider/datagen/tests/data/baked/mod.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit db791b5

Please sign in to comment.