Skip to content

Commit

Permalink
Merge pull request #39 from a-luna:changes-20240325
Browse files Browse the repository at this point in the history
Refactor and update dependencies
  • Loading branch information
a-luna authored Mar 25, 2024
2 parents 02affd1 + b9b4537 commit c6de23e
Show file tree
Hide file tree
Showing 23 changed files with 945 additions and 380 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</details>
<br />
<p><span class="alert">⚠️</span> <strong><i>NOTE: Specifying <code>show_props=Minimum</code> in any request is redundant since the <strong>Minimum</strong> property group is included in all responses.</i></strong></p>
<p>If you wish to explore the properties of one or more specific characters, the <code>/v1/characters/-/{string}</code> and <code>/v1/characters/filter</code> endpoints accept one or more <code>show_props</code> parameters that allow you to specify additional property groups to include in the response.</p><p>For example, you could view the properties from groups <strong>UTF-8</strong>, <strong>Numeric</strong>, and <strong>Script</strong> for the character Ⱒ (<code>U+2C22 <span>GLAGOLITIC CAPITAL LETTER SPIDERY HA</span></code>), which is equal to <code>0xE2 0xB0 0xA2</code> in UTF-8 encoding by submitting the following request: <a href="http://localhost:3507/v1/characters/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script" rel="noopener noreferrer" target="_blank">/v1/characters/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script</a>.</p>
<p>If you wish to explore the properties of one or more specific characters, the <code>/v1/characters/-/{string}</code> and <code>/v1/characters/filter</code> endpoints accept one or more <code>show_props</code> parameters that allow you to specify additional property groups to include in the response.</p><p>For example, you could view the properties from groups <strong>UTF-8</strong>, <strong>Numeric</strong>, and <strong>Script</strong> for the character Ⱒ (<code>U+2C22 <span>GLAGOLITIC CAPITAL LETTER SPIDERY HA</span></code>), which is equal to <code>0xE2 0xB0 0xA2</code> in UTF-8 encoding by submitting the following request: <a href="http://localhost:3507/v1/characters/-/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script" rel="noopener noreferrer" target="_blank">/v1/characters/-/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script</a>.</p>
<h4 id="verbosity">Verbosity</h4>
<p>The values of many of the properties that are defined for each character are only meaningful for specific blocks or a small subset of codepoints (e.g., the <code>hangul_syllable_type</code> property will have a <code>(Not Applicable) NA</code> value for all codepoints except those in the four blocks that contain characters from the Hangul writing system).</p><p>By default, the <code>hangul_syllable_type</code> property will <strong>NOT</strong> be included with the response for any character that has this default value even if the user has submitted a request containing <code>show_props=hangul</code> or <code>show_props=all</code>. For actual Hangul characters, the property will be included in the response.</p><p>These properties are removed to make the size of each response as small as possible. Knowing that the 🇺 (<code>U+1F1FA <span>REGIONAL INDICATOR SYMBOL LETTER U</span></code>) character has the value <code>hangul_syllable_type=NA</code> provides no real information about this character.</p><p>However, if you wish to see every property value, include <code>verbose=true</code> with your request to the <code>/v1/characters/-/{string}</code> or <code>/v1/characters/filter</code> endpoints.</p>
<details class='property-group'>
Expand Down
22 changes: 21 additions & 1 deletion app/config/api_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ class UnicodeApiSettings:
PLANES_JSON: Path = field(init=False)
BLOCKS_JSON: Path = field(init=False)
CHAR_NAME_MAP: Path = field(init=False)
UNIHAN_CHARS_JSON: Path = field(init=False)
TANGUT_CHARS_JSON: Path = field(init=False)
JSON_ZIP_FILE: Path = field(init=False)
JSON_ZIP_URL: str = field(init=False, default="")
CSV_FOLDER: Path = field(init=False)
Expand Down Expand Up @@ -122,6 +124,8 @@ def __post_init__(self) -> None:
self.PLANES_JSON = json_folder.joinpath("planes.json")
self.BLOCKS_JSON = json_folder.joinpath("blocks.json")
self.CHAR_NAME_MAP = json_folder.joinpath("char_name_map.json")
self.UNIHAN_CHARS_JSON = json_folder.joinpath("unihan_chars.json")
self.TANGUT_CHARS_JSON = json_folder.joinpath("tangut_chars.json")
self.JSON_ZIP_FILE = json_folder.joinpath(JSON_ZIP_FILE_NAME)
self.JSON_ZIP_URL = f"{HTTP_BUCKET_URL}/{self.UNICODE_VERSION}/{JSON_ZIP_FILE_NAME}"
self.CSV_FOLDER = csv_folder
Expand Down Expand Up @@ -171,7 +175,19 @@ def get_non_unihan_character_name_map(self) -> dict[int, str]:
json_map = json.loads(self.CHAR_NAME_MAP.read_text())
return {int(codepoint): name for (codepoint, name) in json_map.items()}

def init_data_folders(self) -> None: # pragma: no cover
def get_unihan_character_name_map(self) -> dict[int, int]:
    """Load the Unihan character map stored at ``UNIHAN_CHARS_JSON``.

    Despite the method name, the values are not names: every key and every
    value of the JSON object is converted with ``int()``, and the loop
    variables identify them as a codepoint and a block id respectively.

    Returns:
        A ``{codepoint: block_id}`` dict. When the JSON file does not exist
        an empty dict is returned, so the result can always be used as a
        mapping (``.keys()``, ``.items()``, ``in``).
    """
    if not self.UNIHAN_CHARS_JSON.exists():  # pragma: no cover
        # An empty dict (not an empty set) keeps the fallback the same type
        # as the normal return value below.
        return {}
    json_map = json.loads(self.UNIHAN_CHARS_JSON.read_text())
    return {int(codepoint): int(block_id) for (codepoint, block_id) in json_map.items()}

def get_tangut_character_name_map(self) -> dict[int, int]:
    """Load the Tangut character map stored at ``TANGUT_CHARS_JSON``.

    Despite the method name, the values are not names: every key and every
    value of the JSON object is converted with ``int()``, and the loop
    variables identify them as a codepoint and a block id respectively.

    Returns:
        A ``{codepoint: block_id}`` dict. When the JSON file does not exist
        an empty dict is returned, so the result can always be used as a
        mapping (``.keys()``, ``.items()``, ``in``).
    """
    if not self.TANGUT_CHARS_JSON.exists():  # pragma: no cover
        # An empty dict (not an empty set) keeps the fallback the same type
        # as the normal return value below.
        return {}
    json_map = json.loads(self.TANGUT_CHARS_JSON.read_text())
    return {int(codepoint): int(block_id) for (codepoint, block_id) in json_map.items()}

def init_data_folders(self) -> None: # pragma: no cover # noqa: C901
self.DB_FOLDER.mkdir(parents=True, exist_ok=True)
if self.DB_FILE.exists():
self.DB_FILE.unlink()
Expand All @@ -184,6 +200,10 @@ def init_data_folders(self) -> None: # pragma: no cover
self.BLOCKS_JSON.unlink()
if self.CHAR_NAME_MAP.exists():
self.CHAR_NAME_MAP.unlink()
if self.UNIHAN_CHARS_JSON.exists():
self.UNIHAN_CHARS_JSON.unlink()
if self.TANGUT_CHARS_JSON.exists():
self.TANGUT_CHARS_JSON.unlink()

if self.is_dev or self.is_test:
self.CSV_FOLDER.mkdir(parents=True, exist_ok=True)
Expand Down
6 changes: 3 additions & 3 deletions app/core/rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def is_exceeded(self, request: Request) -> Result[None]:
def apply_rate_limit_to_request(self, request: Request):
if self.settings.is_test:
return enable_rate_limit_feature_for_test(request)
return request_origin_is_external(request) and requested_route_is_rate_limited(request)
return request_origin_is_external(request) and requested_route_is_rate_limited(request) # pragma: no cover

def get_allowed_at(self, tat: float) -> float:
return (dtaware_fromtimestamp(tat) - self.delay_tolerance_ms).timestamp()
Expand Down Expand Up @@ -145,15 +145,15 @@ def enable_rate_limit_feature_for_test(request: Request) -> bool:
return False # pragma: no cover


def request_origin_is_external(request: Request) -> bool:
def request_origin_is_external(request: Request) -> bool:  # pragma: no cover
    """Decide whether a request should be treated as coming from outside.

    Args:
        request: The incoming request; only ``request.client`` and
            ``request.headers`` are read.

    Returns:
        ``False`` when the client host is one of ``localhost``,
        ``127.0.0.1`` or ``testserver``. Otherwise, if a ``sec-fetch-site``
        header is present, ``True`` unless its value is ``same-site``.
        ``True`` in every other case.
    """
    # request.client may be None (no client address available); such a
    # request cannot be matched against the local host list, so fall
    # through to the header check instead of raising AttributeError.
    client = request.client
    if client is not None and client.host in ["localhost", "127.0.0.1", "testserver"]:
        return False
    if "sec-fetch-site" in request.headers:
        return request.headers["sec-fetch-site"] != "same-site"
    return True


def requested_route_is_rate_limited(request: Request):
def requested_route_is_rate_limited(request: Request): # pragma: no cover
return RATE_LIMIT_ROUTE_REGEX.search(request.url.path)


Expand Down
2 changes: 1 addition & 1 deletion app/core/redis_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,4 @@ def now(self) -> datetime:
return dtaware_fromtimestamp(self.time())


redis = RedisClient() if "TEST" not in os.environ.get("ENV", "DEV") else TestRedisClient()
redis = TestRedisClient() if get_settings().is_test else RedisClient()
145 changes: 82 additions & 63 deletions app/data/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
NULL_BLOCK,
NULL_PLANE,
)
from app.schemas.enums import UnassignedCharacterType

CHAR_TABLES = [db.UnicodeCharacter, db.UnicodeCharacterUnihan]
from app.schemas.enums import CharacterType


class UnicodeDataCache:
Expand Down Expand Up @@ -72,9 +70,13 @@ def cjk_compatibility_block_ids(self) -> set[int]:
return {b.id for b in self.blocks if "cjk compatibility ideographs" in b.name.lower() and b.id}

@property
def tangut_character_block_ids(self) -> set[int]:
def tangut_ideograph_block_ids(self) -> set[int]:
return {b.id for b in self.blocks if "tangut" in b.name.lower() and "component" not in b.name.lower() and b.id}

@property
def tangut_component_block_ids(self) -> set[int]:
    """Ids of every block whose lowercased name contains "tangut components".

    Blocks with a falsy id are left out.
    """
    matching_ids = set()
    for block in self.blocks:
        if block.id and "tangut components" in block.name.lower():
            matching_ids.add(block.id)
    return matching_ids

@property
def surrogate_block_ids(self) -> set[int]:
    """Ids of every block whose lowercased name contains "surrogate".

    Blocks with a falsy id are left out.
    """
    matching_ids = set()
    for block in self.blocks:
        if block.id and "surrogate" in block.name.lower():
            matching_ids.add(block.id)
    return matching_ids
Expand All @@ -85,11 +87,7 @@ def private_use_block_ids(self) -> set[int]:
b.id for b in self.blocks if "private use" in b.name.lower() and "surrogate" not in b.name.lower() and b.id
}

@property
def all_cjk_ideograph_block_ids(self) -> set[int]:
return set(list(self.cjk_unified_ideograph_block_ids) + list(self.cjk_compatibility_block_ids))

@property
@cached_property
def planes(self) -> list[db.UnicodePlane]:
return self.settings.get_unicode_planes_data()

Expand Down Expand Up @@ -129,61 +127,62 @@ def all_control_character_codepoints(self) -> set[int]:
def all_noncharacter_codepoints(self) -> set[int]:
return set(NON_CHARACTER_CODEPOINTS)

@property
@cached_property
def all_non_unihan_codepoints(self) -> set[int]:
return set(self.non_unihan_character_name_map.keys())
return set(self.settings.get_non_unihan_character_name_map().keys())

@property
def all_cjk_ideograph_codepoints(self):
cjk_blocks = [self.get_unicode_block_by_id(block_id) for block_id in sorted(self.all_cjk_ideograph_block_ids)]
cjk_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in cjk_blocks]
return set(itertools.chain(*cjk_codepoints)) - self.all_noncharacter_codepoints
@cached_property
def all_cjk_codepoints(self) -> set[int]:
return set(self.settings.get_unihan_character_name_map().keys())

@cached_property
def all_tangut_ideograph_codepoints(self) -> set[int]:
return {
cp
for cp, block_id in self.settings.get_tangut_character_name_map().items()
if block_id in self.tangut_ideograph_block_ids
}

@cached_property
def all_tangut_component_codepoints(self) -> set[int]:
return {
cp
for cp, block_id in self.settings.get_tangut_character_name_map().items()
if block_id in self.tangut_component_block_ids
}

@property
def all_tangut_codepoints(self):
tangut_blocks = [self.get_unicode_block_by_id(block_id) for block_id in self.tangut_character_block_ids]
tangut_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in tangut_blocks]
return set(itertools.chain(*tangut_codepoints)) - self.all_noncharacter_codepoints
def all_tangut_codepoints(self) -> set[int]:
return self.all_tangut_ideograph_codepoints | self.all_tangut_component_codepoints

@property
def all_surrogate_codepoints(self) -> set[int]:
su_blocks = [self.get_unicode_block_by_id(block_id) for block_id in self.surrogate_block_ids]
su_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in su_blocks]
return set(itertools.chain(*su_codepoints)) - self.all_noncharacter_codepoints
return self.get_all_codepoints_in_block_id_list(self.surrogate_block_ids)

@property
def all_private_use_codepoints(self) -> set[int]:
pu_blocks = [self.get_unicode_block_by_id(block_id) for block_id in self.private_use_block_ids]
pu_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in pu_blocks]
return set(itertools.chain(*pu_codepoints)) - self.all_noncharacter_codepoints
return self.get_all_codepoints_in_block_id_list(self.private_use_block_ids)

@property
def all_assigned_codepoints(self) -> set[int]:
return set(
list(self.all_non_unihan_codepoints)
+ list(self.all_cjk_ideograph_codepoints)
+ list(self.all_cjk_codepoints)
+ list(self.all_tangut_codepoints)
+ list(self.all_surrogate_codepoints)
+ list(self.all_private_use_codepoints)
)

@property
def all_reserved_codepoints(self) -> set[int]:
return (
self.all_codepoints_in_unicode_space
- self.all_assigned_codepoints
- self.all_noncharacter_codepoints
- self.all_surrogate_codepoints
- self.all_private_use_codepoints
)

@property
def official_number_of_unicode_characters(self) -> int:
# The "official" number of characters listed for each version of Unicode is the total number
# of graphic and format characters (i.e., excluding private-use characters, control characters,
# noncharacters and surrogate code points).
# source: https://en.wikipedia.org/wiki/Unicode#cite_ref-25
return sum(plane.total_defined for plane in self.planes) - len(self.all_control_character_codepoints)
total_defined = (
len(self.all_non_unihan_codepoints) + len(self.all_cjk_codepoints) + len(self.all_tangut_codepoints)
)
return total_defined - len(self.all_control_character_codepoints)

@property
def unicode_version(self) -> str:
Expand Down Expand Up @@ -240,6 +239,9 @@ def get_unicode_plane_containing_block_id(self, block_id: int) -> db.UnicodePlan
found = [p for p in self.planes if p.start_block_id <= block_id and block_id <= p.finish_block_id]
return found[0] if found else db.UnicodePlane(**NULL_PLANE)

def codepoint_is_in_unicode_space(self, codepoint: int) -> bool:
return codepoint in self.all_codepoints_in_unicode_space

def codepoint_is_assigned(self, codepoint: int) -> bool:
return codepoint in self.all_assigned_codepoints

Expand All @@ -252,29 +254,46 @@ def codepoint_is_surrogate(self, codepoint: int) -> bool:
def codepoint_is_private_use(self, codepoint: int) -> bool:
return codepoint in self.all_private_use_codepoints

def codepoint_is_reserved(self, codepoint: int) -> bool:
return codepoint in self.all_reserved_codepoints

def codepoint_is_ascii_control_character(self, codepoint: int) -> bool:
return codepoint in C0_CONTROL_CHARACTERS

def character_is_non_unihan(self, codepoint: int) -> bool:
return codepoint in self.non_unihan_character_name_map

def character_is_unihan(self, codepoint: int) -> bool:
return codepoint in self.all_cjk_ideograph_codepoints
return codepoint in self.all_cjk_codepoints

def character_is_tangut(self, codepoint: int) -> bool:
return codepoint in self.all_tangut_codepoints

@cache
def get_character_name(self, codepoint: int) -> str:
char_type = self.get_character_type(codepoint)
match char_type:
case CharacterType.NON_UNIHAN:
return self.get_name_for_non_unihan_character(codepoint)
case CharacterType.UNIHAN | CharacterType.TANGUT:
return self.get_generic_name_for_codepoint(codepoint)
case _:
return self.get_label_for_unnamed_codepoint(codepoint, char_type)

def get_character_type(self, codepoint: int) -> CharacterType:
return (
self.get_name_for_non_unihan_character(codepoint)
CharacterType.NON_UNIHAN
if self.character_is_non_unihan(codepoint)
else self.get_generic_name_for_codepoint(codepoint)
if self.character_is_unihan(codepoint) or self.character_is_tangut(codepoint)
else self.get_label_for_unassigned_codepoint(codepoint)
else CharacterType.UNIHAN
if self.character_is_unihan(codepoint)
else CharacterType.TANGUT
if self.character_is_tangut(codepoint)
else CharacterType.NONCHARACTER
if self.codepoint_is_noncharacter(codepoint)
else CharacterType.SURROGATE
if self.codepoint_is_surrogate(codepoint)
else CharacterType.PRIVATE_USE
if self.codepoint_is_private_use(codepoint)
else CharacterType.RESERVED
if self.codepoint_is_in_unicode_space(codepoint)
else CharacterType.INVALID
)

def get_name_for_non_unihan_character(self, codepoint: int) -> str:
Expand All @@ -288,27 +307,23 @@ def get_generic_name_for_codepoint(self, codepoint: int) -> str:
else f"CJK COMPATIBILITY IDEOGRAPH-{codepoint:04X}"
if block.id in self.cjk_compatibility_block_ids
else f"TANGUT IDEOGRAPH-{codepoint:04X}"
if block.id in self.tangut_character_block_ids
if block.id in self.tangut_ideograph_block_ids
else f"TANGUT COMPONENT-{self.get_tangut_component_index(codepoint):03}"
if block.id in self.tangut_component_block_ids
else ""
)

def get_label_for_unassigned_codepoint(self, codepoint: int) -> str:
if (char_type := self.get_unassigned_character_type(codepoint)) != UnassignedCharacterType.INVALID:
return f"<{char_type}-{codepoint:04X}>"
return f"Invalid Codepoint (U+{codepoint:04X})"
def get_tangut_component_index(self, codepoint: int) -> int:
    """Return the one-based position of ``codepoint`` within the Tangut component block.

    Args:
        codepoint: Integer codepoint; expected to lie inside the block, but
            this is not checked here.

    Returns:
        ``codepoint - block.start_dec + 1`` for the selected block.

    Raises:
        IndexError: If ``tangut_component_block_ids`` is empty.
    """
    # Sets have no defined "first" element, so choose the lowest id explicitly;
    # the result is then deterministic even if more than one block matches.
    # NOTE(review): assumes the lowest-id block is the base components block - confirm.
    first_block_id = sorted(self.tangut_component_block_ids)[0]
    tangut_components_block = self.get_unicode_block_by_id(first_block_id)
    # The Tangut component characters are one-indexed
    return (codepoint - tangut_components_block.start_dec) + 1

def get_unassigned_character_type(self, codepoint: int) -> UnassignedCharacterType:
return (
UnassignedCharacterType.NONCHARACTER
if self.codepoint_is_noncharacter(codepoint)
else UnassignedCharacterType.SURROGATE
if self.codepoint_is_surrogate(codepoint)
else UnassignedCharacterType.PRIVATE_USE
if self.codepoint_is_private_use(codepoint)
else UnassignedCharacterType.RESERVED
if self.codepoint_is_reserved(codepoint)
else UnassignedCharacterType.INVALID
)
def get_label_for_unnamed_codepoint(self, codepoint: int, char_type: CharacterType) -> str:
    """Build the placeholder label for a codepoint that has no character name.

    Args:
        codepoint: Integer codepoint, rendered as at least four uppercase hex digits.
        char_type: Classification of the codepoint.

    Returns:
        ``"Invalid Codepoint (U+XXXX)"`` when ``char_type`` equals
        ``CharacterType.INVALID``; otherwise ``"<{char_type}-XXXX>"``.
    """
    if char_type == CharacterType.INVALID:
        return f"Invalid Codepoint (U+{codepoint:04X})"
    return f"<{char_type}-{codepoint:04X}>"

def get_mapped_codepoint_from_hex(self, codepoint_hex: str) -> str: # pragma: no cover
if not codepoint_hex:
Expand All @@ -328,5 +343,9 @@ def get_mapped_codepoint_from_int(self, codepoint_dec: int) -> str: # pragma: n
else ""
)

def get_all_codepoints_in_block_id_list(self, block_id_list: list[int]) -> set[int]:
blocks = [self.get_unicode_block_by_id(block_id) for block_id in block_id_list]
return set(itertools.chain(*[list(range(block.start_dec, block.finish_dec + 1)) for block in blocks]))


cached_data = UnicodeDataCache()
Loading

0 comments on commit c6de23e

Please sign in to comment.