Merge pull request #39 from a-luna:changes-20240325

Refactor and update dependencies
a-luna · Mar 25, 2024 · c6de23e · c6de23e
2 parents 02affd1 + b9b4537
commit c6de23e
Show file tree

Hide file tree

Showing 23 changed files with 945 additions and 380 deletions.
diff --git a/README.md b/README.md
@@ -183,7 +183,7 @@
 	</details>
 	<br />
 	<p><span class="alert">⚠️</span> <strong><i>NOTE: Specifying <code>show_props=Minimum</code> in any request is redundant since the <strong>Minimum</strong> property group is included in all responses.</i></strong></p>
-	<p>If you wish to explore the properties of one or more specifc characters, the <code>/v1/characters/-/{string}</code> and <code>/v1/characters/filter</code> endpoints accept one or more <code>show_props</code> parameters that allow you to specify additional property groups to include in the response.</p><p>For example, you could view the properties from groups <strong>UTF-8</strong>, <strong>Numeric</strong>, and <strong>Script</strong> for the character Ⱒ (<code>U+2C22 <span>GLAGOLITIC CAPITAL LETTER SPIDERY HA</span></code>), which is equal to <code>0xE2 0xB0 0xA2</code> in UTF-8 encoding by submitting the following request: <a href="http://localhost:3507/v1/characters/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script" rel="noopener noreferrer" target="_blank">/v1/characters/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script</a>.</p>
+	<p>If you wish to explore the properties of one or more specific characters, the <code>/v1/characters/-/{string}</code> and <code>/v1/characters/filter</code> endpoints accept one or more <code>show_props</code> parameters that allow you to specify additional property groups to include in the response.</p><p>For example, you could view the properties from groups <strong>UTF-8</strong>, <strong>Numeric</strong>, and <strong>Script</strong> for the character Ⱒ (<code>U+2C22 <span>GLAGOLITIC CAPITAL LETTER SPIDERY HA</span></code>), which is equal to <code>0xE2 0xB0 0xA2</code> in UTF-8 encoding, by submitting the following request: <a href="http://localhost:3507/v1/characters/-/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script" rel="noopener noreferrer" target="_blank">/v1/characters/-/%E2%B0%A2?show_props=UTF8&show_props=Numeric&show_props=Script</a>.</p>
 	<h4 id="verbosity">Verbosity</h4>
 	<p>The values of many of the properties that are defined for each character are only meaningful for specific blocks or a small subset of codepoints (e.g., the <code>hangul_syllable_type</code> property will have a <code>NA (Not Applicable)</code> value for all codepoints except those in the four blocks that contain characters from the Hangul writing system).</p><p>By default, the <code>hangul_syllable_type</code> property will <strong>NOT</strong> be included with the response for any character that has this default value even if the user has submitted a request containing <code>show_props=hangul</code> or <code>show_props=all</code>. For actual Hangul characters, the property will be included in the response.</p><p>These properties are removed to make the size of each response as small as possible. Knowing that the 🇺 (<code>U+1F1FA <span>REGIONAL INDICATOR SYMBOL LETTER U</span></code>) character has the value <code>hangul_syllable_type=NA</code> provides no real information about this character.</p><p>However, if you wish to see every property value, include <code>verbose=true</code> with your request to the <code>/v1/characters/-/{string}</code> or <code>/v1/characters/filter</code> endpoints.</p>
 	<details class='property-group'>

diff --git a/app/config/api_settings.py b/app/config/api_settings.py
@@ -86,6 +86,8 @@ class UnicodeApiSettings:
     PLANES_JSON: Path = field(init=False)
     BLOCKS_JSON: Path = field(init=False)
     CHAR_NAME_MAP: Path = field(init=False)
+    UNIHAN_CHARS_JSON: Path = field(init=False)
+    TANGUT_CHARS_JSON: Path = field(init=False)
     JSON_ZIP_FILE: Path = field(init=False)
     JSON_ZIP_URL: str = field(init=False, default="")
     CSV_FOLDER: Path = field(init=False)
@@ -122,6 +124,8 @@ def __post_init__(self) -> None:
         self.PLANES_JSON = json_folder.joinpath("planes.json")
         self.BLOCKS_JSON = json_folder.joinpath("blocks.json")
         self.CHAR_NAME_MAP = json_folder.joinpath("char_name_map.json")
+        self.UNIHAN_CHARS_JSON = json_folder.joinpath("unihan_chars.json")
+        self.TANGUT_CHARS_JSON = json_folder.joinpath("tangut_chars.json")
         self.JSON_ZIP_FILE = json_folder.joinpath(JSON_ZIP_FILE_NAME)
         self.JSON_ZIP_URL = f"{HTTP_BUCKET_URL}/{self.UNICODE_VERSION}/{JSON_ZIP_FILE_NAME}"
         self.CSV_FOLDER = csv_folder
@@ -171,7 +175,19 @@ def get_non_unihan_character_name_map(self) -> dict[int, str]:
         json_map = json.loads(self.CHAR_NAME_MAP.read_text())
         return {int(codepoint): name for (codepoint, name) in json_map.items()}
 
-    def init_data_folders(self) -> None:  # pragma: no cover
+    def get_unihan_character_name_map(self) -> dict[int, int]:
+        """Load the Unihan codepoint map stored in ``UNIHAN_CHARS_JSON``.
+
+        Returns:
+            A dict mapping each codepoint (as ``int``) to the integer value stored for it in
+            the JSON file (called ``block_id`` here). An empty dict is returned when the
+            file does not exist.
+        """
+        if not self.UNIHAN_CHARS_JSON.exists():  # pragma: no cover
+            # Must be an empty dict rather than a set: the result is consumed with
+            # ``.keys()``, which a set does not provide.
+            return {}
+        json_map = json.loads(self.UNIHAN_CHARS_JSON.read_text())
+        # JSON object keys are always strings, so convert both sides back to int.
+        return {int(codepoint): int(block_id) for (codepoint, block_id) in json_map.items()}
+
+    def get_tangut_character_name_map(self) -> dict[int, int]:
+        """Load the Tangut codepoint map stored in ``TANGUT_CHARS_JSON``.
+
+        Returns:
+            A dict mapping each codepoint (as ``int``) to the integer value stored for it in
+            the JSON file (called ``block_id`` here). An empty dict is returned when the
+            file does not exist.
+        """
+        if not self.TANGUT_CHARS_JSON.exists():  # pragma: no cover
+            # Must be an empty dict rather than a set: the result is consumed with
+            # ``.items()``, which a set does not provide.
+            return {}
+        json_map = json.loads(self.TANGUT_CHARS_JSON.read_text())
+        # JSON object keys are always strings, so convert both sides back to int.
+        return {int(codepoint): int(block_id) for (codepoint, block_id) in json_map.items()}
+
+    def init_data_folders(self) -> None:  # pragma: no cover  # noqa: C901
         self.DB_FOLDER.mkdir(parents=True, exist_ok=True)
         if self.DB_FILE.exists():
             self.DB_FILE.unlink()
@@ -184,6 +200,10 @@ def init_data_folders(self) -> None:  # pragma: no cover
             self.BLOCKS_JSON.unlink()
         if self.CHAR_NAME_MAP.exists():
             self.CHAR_NAME_MAP.unlink()
+        if self.UNIHAN_CHARS_JSON.exists():
+            self.UNIHAN_CHARS_JSON.unlink()
+        if self.TANGUT_CHARS_JSON.exists():
+            self.TANGUT_CHARS_JSON.unlink()
 
         if self.is_dev or self.is_test:
             self.CSV_FOLDER.mkdir(parents=True, exist_ok=True)

diff --git a/app/core/rate_limit.py b/app/core/rate_limit.py
@@ -103,7 +103,7 @@ def is_exceeded(self, request: Request) -> Result[None]:
     def apply_rate_limit_to_request(self, request: Request):
         if self.settings.is_test:
             return enable_rate_limit_feature_for_test(request)
-        return request_origin_is_external(request) and requested_route_is_rate_limited(request)
+        return request_origin_is_external(request) and requested_route_is_rate_limited(request)  # pragma: no cover
 
     def get_allowed_at(self, tat: float) -> float:
         return (dtaware_fromtimestamp(tat) - self.delay_tolerance_ms).timestamp()
@@ -145,15 +145,15 @@ def enable_rate_limit_feature_for_test(request: Request) -> bool:
     return False  # pragma: no cover
 
 
-def request_origin_is_external(request: Request) -> bool:
+def request_origin_is_external(request: Request) -> bool:  # pragma: no cover
     if request.client.host in ["localhost", "127.0.0.1", "testserver"]:
         return False
     if "sec-fetch-site" in request.headers:
         return request.headers["sec-fetch-site"] != "same-site"
     return True
 
 
-def requested_route_is_rate_limited(request: Request):
+def requested_route_is_rate_limited(request: Request):  # pragma: no cover
     return RATE_LIMIT_ROUTE_REGEX.search(request.url.path)
 
 

diff --git a/app/core/redis_client.py b/app/core/redis_client.py
@@ -165,4 +165,4 @@ def now(self) -> datetime:
         return dtaware_fromtimestamp(self.time())
 
 
-redis = RedisClient() if "TEST" not in os.environ.get("ENV", "DEV") else TestRedisClient()
+redis = TestRedisClient() if get_settings().is_test else RedisClient()
diff --git a/app/data/cache.py b/app/data/cache.py
@@ -15,9 +15,7 @@
     NULL_BLOCK,
     NULL_PLANE,
 )
-from app.schemas.enums import UnassignedCharacterType
-
-CHAR_TABLES = [db.UnicodeCharacter, db.UnicodeCharacterUnihan]
+from app.schemas.enums import CharacterType
 
 
 class UnicodeDataCache:
@@ -72,9 +70,13 @@ def cjk_compatibility_block_ids(self) -> set[int]:
         return {b.id for b in self.blocks if "cjk compatibility ideographs" in b.name.lower() and b.id}
 
     @property
-    def tangut_character_block_ids(self) -> set[int]:
+    def tangut_ideograph_block_ids(self) -> set[int]:
         return {b.id for b in self.blocks if "tangut" in b.name.lower() and "component" not in b.name.lower() and b.id}
 
+    @property
+    def tangut_component_block_ids(self) -> set[int]:
+        return {b.id for b in self.blocks if "tangut components" in b.name.lower() and b.id}
+
     @property
     def surrogate_block_ids(self) -> set[int]:
         return {b.id for b in self.blocks if "surrogate" in b.name.lower() and b.id}
@@ -85,11 +87,7 @@ def private_use_block_ids(self) -> set[int]:
             b.id for b in self.blocks if "private use" in b.name.lower() and "surrogate" not in b.name.lower() and b.id
         }
 
-    @property
-    def all_cjk_ideograph_block_ids(self) -> set[int]:
-        return set(list(self.cjk_unified_ideograph_block_ids) + list(self.cjk_compatibility_block_ids))
-
-    @property
+    @cached_property
     def planes(self) -> list[db.UnicodePlane]:
         return self.settings.get_unicode_planes_data()
 
@@ -129,61 +127,62 @@ def all_control_character_codepoints(self) -> set[int]:
     def all_noncharacter_codepoints(self) -> set[int]:
         return set(NON_CHARACTER_CODEPOINTS)
 
-    @property
+    @cached_property
     def all_non_unihan_codepoints(self) -> set[int]:
-        return set(self.non_unihan_character_name_map.keys())
+        return set(self.settings.get_non_unihan_character_name_map().keys())
 
-    @property
-    def all_cjk_ideograph_codepoints(self):
-        cjk_blocks = [self.get_unicode_block_by_id(block_id) for block_id in sorted(self.all_cjk_ideograph_block_ids)]
-        cjk_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in cjk_blocks]
-        return set(itertools.chain(*cjk_codepoints)) - self.all_noncharacter_codepoints
+    @cached_property
+    def all_cjk_codepoints(self) -> set[int]:
+        return set(self.settings.get_unihan_character_name_map().keys())
+
+    @cached_property
+    def all_tangut_ideograph_codepoints(self) -> set[int]:
+        return {
+            cp
+            for cp, block_id in self.settings.get_tangut_character_name_map().items()
+            if block_id in self.tangut_ideograph_block_ids
+        }
+
+    @cached_property
+    def all_tangut_component_codepoints(self) -> set[int]:
+        return {
+            cp
+            for cp, block_id in self.settings.get_tangut_character_name_map().items()
+            if block_id in self.tangut_component_block_ids
+        }
 
     @property
-    def all_tangut_codepoints(self):
-        tangut_blocks = [self.get_unicode_block_by_id(block_id) for block_id in self.tangut_character_block_ids]
-        tangut_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in tangut_blocks]
-        return set(itertools.chain(*tangut_codepoints)) - self.all_noncharacter_codepoints
+    def all_tangut_codepoints(self) -> set[int]:
+        return self.all_tangut_ideograph_codepoints | self.all_tangut_component_codepoints
 
     @property
     def all_surrogate_codepoints(self) -> set[int]:
-        su_blocks = [self.get_unicode_block_by_id(block_id) for block_id in self.surrogate_block_ids]
-        su_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in su_blocks]
-        return set(itertools.chain(*su_codepoints)) - self.all_noncharacter_codepoints
+        return self.get_all_codepoints_in_block_id_list(self.surrogate_block_ids)
 
     @property
     def all_private_use_codepoints(self) -> set[int]:
-        pu_blocks = [self.get_unicode_block_by_id(block_id) for block_id in self.private_use_block_ids]
-        pu_codepoints = [list(range(b.start_dec, b.finish_dec + 1)) for b in pu_blocks]
-        return set(itertools.chain(*pu_codepoints)) - self.all_noncharacter_codepoints
+        return self.get_all_codepoints_in_block_id_list(self.private_use_block_ids)
 
     @property
     def all_assigned_codepoints(self) -> set[int]:
         return set(
             list(self.all_non_unihan_codepoints)
-            + list(self.all_cjk_ideograph_codepoints)
+            + list(self.all_cjk_codepoints)
             + list(self.all_tangut_codepoints)
             + list(self.all_surrogate_codepoints)
             + list(self.all_private_use_codepoints)
         )
 
-    @property
-    def all_reserved_codepoints(self) -> set[int]:
-        return (
-            self.all_codepoints_in_unicode_space
-            - self.all_assigned_codepoints
-            - self.all_noncharacter_codepoints
-            - self.all_surrogate_codepoints
-            - self.all_private_use_codepoints
-        )
-
     @property
     def official_number_of_unicode_characters(self) -> int:
         # The "official" number of characters listed for each version of Unicode is the total number
         # of graphic and format characters (i.e., excluding private-use characters, control characters,
         # noncharacters and surrogate code points).
         # source: https://en.wikipedia.org/wiki/Unicode#cite_ref-25
-        return sum(plane.total_defined for plane in self.planes) - len(self.all_control_character_codepoints)
+        total_defined = (
+            len(self.all_non_unihan_codepoints) + len(self.all_cjk_codepoints) + len(self.all_tangut_codepoints)
+        )
+        return total_defined - len(self.all_control_character_codepoints)
 
     @property
     def unicode_version(self) -> str:
@@ -240,6 +239,9 @@ def get_unicode_plane_containing_block_id(self, block_id: int) -> db.UnicodePlan
         found = [p for p in self.planes if p.start_block_id <= block_id and block_id <= p.finish_block_id]
         return found[0] if found else db.UnicodePlane(**NULL_PLANE)
 
+    def codepoint_is_in_unicode_space(self, codepoint: int) -> bool:
+        return codepoint in self.all_codepoints_in_unicode_space
+
     def codepoint_is_assigned(self, codepoint: int) -> bool:
         return codepoint in self.all_assigned_codepoints
 
@@ -252,29 +254,46 @@ def codepoint_is_surrogate(self, codepoint: int) -> bool:
     def codepoint_is_private_use(self, codepoint: int) -> bool:
         return codepoint in self.all_private_use_codepoints
 
-    def codepoint_is_reserved(self, codepoint: int) -> bool:
-        return codepoint in self.all_reserved_codepoints
-
     def codepoint_is_ascii_control_character(self, codepoint: int) -> bool:
         return codepoint in C0_CONTROL_CHARACTERS
 
     def character_is_non_unihan(self, codepoint: int) -> bool:
         return codepoint in self.non_unihan_character_name_map
 
     def character_is_unihan(self, codepoint: int) -> bool:
-        return codepoint in self.all_cjk_ideograph_codepoints
+        return codepoint in self.all_cjk_codepoints
 
     def character_is_tangut(self, codepoint: int) -> bool:
         return codepoint in self.all_tangut_codepoints
 
     @cache
     def get_character_name(self, codepoint: int) -> str:
+        char_type = self.get_character_type(codepoint)
+        match char_type:
+            case CharacterType.NON_UNIHAN:
+                return self.get_name_for_non_unihan_character(codepoint)
+            case CharacterType.UNIHAN | CharacterType.TANGUT:
+                return self.get_generic_name_for_codepoint(codepoint)
+            case _:
+                return self.get_label_for_unnamed_codepoint(codepoint, char_type)
+
+    def get_character_type(self, codepoint: int) -> CharacterType:
         return (
-            self.get_name_for_non_unihan_character(codepoint)
+            CharacterType.NON_UNIHAN
             if self.character_is_non_unihan(codepoint)
-            else self.get_generic_name_for_codepoint(codepoint)
-            if self.character_is_unihan(codepoint) or self.character_is_tangut(codepoint)
-            else self.get_label_for_unassigned_codepoint(codepoint)
+            else CharacterType.UNIHAN
+            if self.character_is_unihan(codepoint)
+            else CharacterType.TANGUT
+            if self.character_is_tangut(codepoint)
+            else CharacterType.NONCHARACTER
+            if self.codepoint_is_noncharacter(codepoint)
+            else CharacterType.SURROGATE
+            if self.codepoint_is_surrogate(codepoint)
+            else CharacterType.PRIVATE_USE
+            if self.codepoint_is_private_use(codepoint)
+            else CharacterType.RESERVED
+            if self.codepoint_is_in_unicode_space(codepoint)
+            else CharacterType.INVALID
         )
 
     def get_name_for_non_unihan_character(self, codepoint: int) -> str:
@@ -288,27 +307,23 @@ def get_generic_name_for_codepoint(self, codepoint: int) -> str:
             else f"CJK COMPATIBILITY IDEOGRAPH-{codepoint:04X}"
             if block.id in self.cjk_compatibility_block_ids
             else f"TANGUT IDEOGRAPH-{codepoint:04X}"
-            if block.id in self.tangut_character_block_ids
+            if block.id in self.tangut_ideograph_block_ids
+            else f"TANGUT COMPONENT-{self.get_tangut_component_index(codepoint):03}"
+            if block.id in self.tangut_component_block_ids
             else ""
         )
 
-    def get_label_for_unassigned_codepoint(self, codepoint: int) -> str:
-        if (char_type := self.get_unassigned_character_type(codepoint)) != UnassignedCharacterType.INVALID:
-            return f"<{char_type}-{codepoint:04X}>"
-        return f"Invalid Codepoint (U+{codepoint:04X})"
+    def get_tangut_component_index(self, codepoint: int) -> int:
+        tangut_components_block = self.get_unicode_block_by_id(list(self.tangut_component_block_ids)[0])
+        # The Tangut component characters are one-indexed
+        return (codepoint - tangut_components_block.start_dec) + 1
 
-    def get_unassigned_character_type(self, codepoint: int) -> UnassignedCharacterType:
-        return (
-            UnassignedCharacterType.NONCHARACTER
-            if self.codepoint_is_noncharacter(codepoint)
-            else UnassignedCharacterType.SURROGATE
-            if self.codepoint_is_surrogate(codepoint)
-            else UnassignedCharacterType.PRIVATE_USE
-            if self.codepoint_is_private_use(codepoint)
-            else UnassignedCharacterType.RESERVED
-            if self.codepoint_is_reserved(codepoint)
-            else UnassignedCharacterType.INVALID
-        )
+    def get_label_for_unnamed_codepoint(self, codepoint: int, char_type: CharacterType) -> str:
+        match char_type:
+            case CharacterType.INVALID:
+                return f"Invalid Codepoint (U+{codepoint:04X})"
+            case _:
+                return f"<{char_type}-{codepoint:04X}>"
 
     def get_mapped_codepoint_from_hex(self, codepoint_hex: str) -> str:  # pragma: no cover
         if not codepoint_hex:
@@ -328,5 +343,9 @@ def get_mapped_codepoint_from_int(self, codepoint_dec: int) -> str:  # pragma: n
             else ""
         )
 
+    def get_all_codepoints_in_block_id_list(self, block_id_list: list[int] | set[int]) -> set[int]:
+        """Return every codepoint contained in the blocks identified by ``block_id_list``.
+
+        Each id is resolved with ``get_unicode_block_by_id`` and the result is the union of
+        the inclusive ranges ``start_dec..finish_dec`` of those blocks. Both lists and sets
+        of ids are accepted, since the ``*_block_ids`` properties passed in are sets.
+        """
+        blocks = (self.get_unicode_block_by_id(block_id) for block_id in block_id_list)
+        # Chain the ranges lazily instead of materializing one list per block.
+        return set(itertools.chain.from_iterable(range(b.start_dec, b.finish_dec + 1) for b in blocks))
+
 
 cached_data = UnicodeDataCache()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -165,4 +165,4 @@ def now(self) -> datetime:
		return dtaware_fromtimestamp(self.time())


		redis = RedisClient() if "TEST" not in os.environ.get("ENV", "DEV") else TestRedisClient()
		redis = TestRedisClient() if get_settings().is_test else RedisClient()