Skip to content

Commit

Permalink
Merge branch 'new-yaml-format' of github.com:alephdata/fingerprints i…
Browse files Browse the repository at this point in the history
…nto new-yaml-format
  • Loading branch information
pudo committed Jan 4, 2025
2 parents be392c9 + b23daa6 commit 5a446f0
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 4 deletions.
22 changes: 22 additions & 0 deletions fingerprints/data/types.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1613,6 +1613,10 @@ company_types:
- ПОУ
- ПРОФЕССИОНАЛЬНОЕ ОБРАЗОВАТЕЛЬНОЕ УЧРЕЖДЕНИЕ

- display: ГП
aliases:
- ГОСУДАРСТВЕННОЕ ПРЕДПРИЯТИЕ # STATE ENTERPRISE
- Государственное предприятие
- display: ГУП # государственное унитарное предприятие
compare: ГУП
aliases:
Expand Down Expand Up @@ -1734,12 +1738,20 @@ company_types:
aliases:
- ФГУ
- ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ УЧРЕЖДЕНИЕ # FEDERAL STATE INSTITUTION

- display: ФГКУ # федеральное государственное казенное учреждение
aliases:
- ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ КАЗЕННОЕ УЧРЕЖДЕНИЕ # FEDERAL STATE TREASURY INSTITUTION

- display: ФКУ # федеральное казенное учреждение
compare: ФКУ
aliases:
- ФКУ
- ФЕДЕРАЛЬНОЕ КАЗЕННОЕ УЧРЕЖДЕНИЕ # FEDERAL TREASURY INSTITUTION

- display: ФКП # федеральное казенное предприятие
aliases:
- ФЕДЕРАЛЬНОЕ КАЗЕННОЕ ПРЕДПРИЯТИЕ

- display: ФГБВОУ ВО # федеральное государственное бюджетное военное образовательное учреждение высшего образования
compare: ФГБВОУ ВО
Expand Down Expand Up @@ -1896,6 +1908,10 @@ company_types:
- ПОПРЗ
- ПЕРВИЧНАЯ ОРГАНИЗАЦИЯ ПРОФСОЮЗА РАБОТНИКОВ ЗДРАВООХРАНЕНИЯ # PRIMARY TRADE UNION ORGANIZATION OF HEALTHCARE WORKERS

- display: УПРАВЛЕНИЕ ФСБ РФ
aliases:
- УПРАВЛЕНИЕ ФЕДЕРАЛЬНОЙ СЛУЖБЫ БЕЗОПАСНОСТИ РОССИЙСКОЙ ФЕДЕРАЦИИ # FEDERAL SECURITY SERVICE OF THE RUSSIAN FEDERATION

- display: ОГКОУ # областное государственное казенное образовательное учреждение
aliases:
- ОГКОУ
Expand Down Expand Up @@ -1937,6 +1953,10 @@ company_types:
- НЕГОСУДАРСТВЕННОЕ ОБРАЗОВАТЕЛЬНОЕ УЧРЕЖДЕНИЕ НАЧАЛЬНОГО И ДОПОЛНИТЕЛЬНОГО ПРОФЕССИОНАЛЬНОГО ОБРАЗОВАНИЯ # NON-STATE EDUCATIONAL INSTITUTION OF INITIAL AND FURTHER PROFESSIONAL EDUCATION
- НЕГОСУДАРСТВЕНОЕ ОБРАЗОВАТЕЛЬНОЕ УЧРЕЖДЕНИЕ ДОПОЛНИТЕЛЬНОГО ПРОФЕССИОНАЛЬНОГО ОБРАЗОВАНИЯ (ПОВЫШЕНИЯ КВАЛИФИКАЦИИ) СПЕЦИАЛИСТОВ # NON-STATE EDUCATIONAL INSTITUTION OF FURTHER PROFESSIONAL EDUCATION (QUALIFICATION IMPROVEMENT) FOR SPECIALISTS

- display: САК
aliases:
- СТРАХОВАЯ АКЦИОНЕРНАЯ КОМПАНИЯ # INSURANCE JOINT-STOCK COMPANY

- display: ЗАО # закрытое акционерное общество
compare: CJSC
aliases:
Expand Down Expand Up @@ -1985,7 +2005,9 @@ company_types:
- АКЦИОНЕРНОЕ ОБЩЕСТВО # JOINT-STOCK COMPANY
- АКЦИОНЕРНЫЕ ОБЩЕСТВА
- Акционерное общество
- Aкционерное общество # different ''A'' character
- акционерная компания с ограниченной ответственностью
- АКЦИОНЕРНОГО ОБЩЕСТВА
- display: AO
compare: JSC
aliases:
Expand Down
47 changes: 47 additions & 0 deletions tools/common_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
from collections import defaultdict
from typing import List, Tuple


def common_partial_phrases(
    phrases: List[str], min_length: int, max_length: int
) -> List[Tuple[str, int]]:
    """Tally every contiguous run of words found in ``phrases``.

    Each phrase is split on whitespace, and every window of
    ``min_length`` to ``max_length`` (inclusive) consecutive words is
    re-joined with single spaces and counted across all phrases.

    Args:
        phrases: Input strings to scan.
        min_length: Smallest window size, in words.
        max_length: Largest window size, in words.

    Returns:
        ``(partial_phrase, count)`` pairs ordered by the number of words
        in the partial phrase (most first), then by count (highest first).
    """
    counts = defaultdict(int)
    window_sizes = range(min_length, max_length + 1)

    for phrase in phrases:
        tokens = phrase.split()
        for size in window_sizes:
            # Phrases shorter than the window yield an empty range here.
            last_start = len(tokens) - size
            for begin in range(last_start + 1):
                counts[" ".join(tokens[begin : begin + size])] += 1

    def _rank(entry):
        text, count = entry
        return (len(text.split()), count)

    ranked = list(counts.items())
    ranked.sort(key=_rank, reverse=True)
    return ranked


def process_file(
    file_path: str, min_length: int, max_length: int, min_frequency: int = 200
) -> None:
    """Print the frequent partial phrases found in a text file.

    The file is read as UTF-8, one phrase per line; lines that are empty
    after stripping whitespace are ignored. The remaining lines are passed
    to ``common_partial_phrases`` and every partial phrase occurring at
    least ``min_frequency`` times is printed as ``"<phrase>: <count>"``.

    Args:
        file_path: Path of the text file to analyse.
        min_length: Smallest partial-phrase size, in words.
        max_length: Largest partial-phrase size, in words.
        min_frequency: Minimum number of occurrences a partial phrase
            needs in order to be printed. Defaults to 200, the threshold
            that was previously hard-coded.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        lines = [line for line in stripped_lines if line]

    result = common_partial_phrases(lines, min_length, max_length)

    # Keep only sufficiently frequent phrases. Filtering preserves the
    # ordering already established by common_partial_phrases.
    filtered_result = [
        (phrase, frequency)
        for phrase, frequency in result
        if frequency >= min_frequency
    ]

    for phrase, frequency in filtered_result:
        print(f"{phrase}: {frequency}")


# Script driver: analyse the CSV dump if it is present, using 4- to
# 7-word windows; otherwise report that the input file is missing.
file_name = "tools/output.csv"
min_length = 4
max_length = 7

if not os.path.exists(file_name):
    print(f"File '{file_name}' not found in the current directory.")
else:
    process_file(file_name, min_length, max_length)
15 changes: 11 additions & 4 deletions tools/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
output_file_path, mode="w", newline="", encoding="utf-8"
) as output_file:
fieldnames = [
"entity_id",
"prop",
"prop_type",
"value",
"original_value",
"lang",
] # Define the output CSV column headers
"original_value",
"schema",
]
writer = csv.DictWriter(output_file, fieldnames=fieldnames)

# Write header to the output CSV file
Expand All @@ -31,12 +35,15 @@
"Company",
"Organization",
]:
# Write the filtered row to the output CSV file
writer.writerow(
{
"entity_id": row["entity_id"],
"prop": row["prop"],
"prop_type": row["prop_type"],
"value": row["value"],
"original_value": row["original_value"],
"lang": row["lang"],
"original_value": row["original_value"],
"schema": row["schema"],
}
)

Expand Down

0 comments on commit 5a446f0

Please sign in to comment.