From 3a8983aafbbf4ea412486bb0e1c058a6852eb575 Mon Sep 17 00:00:00 2001
From: Northword
Date: Fri, 4 Oct 2024 20:55:40 +0800
Subject: [PATCH] Refactor combine_journal_lists scripts to improve quality

ref: #150
---
 scripts/combine_journal_lists_dotless.py |  97 +++++++++++++++----
 scripts/combine_journal_lists_dots.py    | 117 +++++++++++++++++------
 2 files changed, 163 insertions(+), 51 deletions(-)

diff --git a/scripts/combine_journal_lists_dotless.py b/scripts/combine_journal_lists_dotless.py
index 11e5ffa..3988d22 100755
--- a/scripts/combine_journal_lists_dotless.py
+++ b/scripts/combine_journal_lists_dotless.py
@@ -3,7 +3,7 @@
 """
 Python script for combining several journal abbreviation lists
 and producing an alphabetically sorted list. If the same journal
-names are repeated, only the version found last is retained.
+names are repeated, only the version found first is retained.
 
 This version of the script specifically combines the lists following the ISO4
 standard WITHOUT dots after abbreviated words.
@@ -13,37 +13,92 @@
 Output: writes file 'journalList_dotless.csv'
 """
 
+import csv
+import json
+from pathlib import Path
+import re
 import sys
-import pandas as pd
 
 # Define the list of CSV files
 import_order = [
-    'journals/journal_abbreviations_entrez.csv',
-    'journals/journal_abbreviations_medicus.csv',
-    'journals/journal_abbreviations_webofscience-dotless.csv'
+    "journals/journal_abbreviations_entrez.csv",
+    "journals/journal_abbreviations_medicus.csv",
+    "journals/journal_abbreviations_webofscience-dotless.csv",
 ]
 
 
-def main(output_filename):
-    # Read and merge CSV files
-    # dfs = [pd.read_csv(file, header=None) for file in import_order]
-    dfs = []
-    for file in import_order:
-        df = pd.read_csv(file, header=None)
-        dfs.append(df)
-        print(f"{file}: {len(df)}")
-    merged_df = pd.concat(dfs, ignore_index=True)
+def load_data(file_paths):
+    """Merge CSV rows into {name: abbr}, dropping unusable rows; the first occurrence wins."""
+    journal_dict = {}
+    normalized_keys = set()
+    for path in file_paths:
+        with open(path, newline="", encoding="utf-8") as file:
+            reader = csv.reader(file)
+            for row in reader:
+                name = row[0].strip() if row else ""
+                abbr = row[1].strip() if len(row) > 1 else ""
 
-    # Drop duplicates based on the first column value and keep the last one obtained
-    merged_df.drop_duplicates(subset=[0], keep='last', inplace=True)
+                # Discard blank/short rows and entries where name or abbr is missing
+                if not (name and abbr):
+                    continue
+                # Discard entries that are too long or too short
+                if len(name) >= 80 or len(name) <= 3:
+                    continue
+                # Discard names that start with non-alphanumeric characters
+                if not name[0].isalnum():
+                    continue
+                # Discard names that consist only of numbers
+                if name.replace(" ", "").isnumeric():
+                    continue
+                # Discard names containing a backslash
+                if "\\" in name:
+                    continue
+                # Discard entries whose first letters differ (a leading "The"/"A" in name is ignored)
+                if abbr[0] != re.sub(r"^(?:The|A)\s+", "", name)[0]:
+                    continue
+                # Only keep the first occurrence
+                if name in journal_dict:
+                    continue
+                # Generate normalizedKey, keeping only the first match
+                normalized_key = normalize_name(name)
+                if normalized_key in normalized_keys:
+                    continue
 
-    # Sort alphabetically
-    sorted_df = merged_df.sort_values(by=[0])
+                journal_dict[name] = abbr
+                normalized_keys.add(normalized_key)  # Add to the set of used keys
+    return journal_dict
 
-    # Save the result to the specified CSV file and ensure values are quoted
-    sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)
-    print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
 
+def normalize_name(name):
+    """
+    Return the de-duplication key: drop the words the/and and the characters "&-:, ()", then lowercase.
+    See src/utils/str.ts -> normalizeKey()
+    """
+    return re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE).lower()
+
+
+def save_to_json(data, output_file):
+    """Save the data to a JSON file (not called by main() in this script)."""
+    with open(output_file, mode="w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=2, ensure_ascii=False)
+
+
+def save_to_csv(data, output_file):
+    """Save the {name: abbr} mapping to a CSV file, one fully quoted row per entry."""
+    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
+        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
+        for name, abbr in data.items():
+            writer.writerow([name, abbr])
+
+
+def main(filename):
+    base_path = Path.cwd()
+    output_filename = base_path / filename
+    import_paths = [base_path / file for file in import_order]
+    # Input and output paths are resolved against the current working directory.
+    journal_data = load_data(import_paths)
+    sorted_journal_data = dict(sorted(journal_data.items()))  # Sort alphabetically
+    save_to_csv(sorted_journal_data, output_filename)
 
 
 if __name__ == "__main__":
diff --git a/scripts/combine_journal_lists_dots.py b/scripts/combine_journal_lists_dots.py
index 00873d2..ecf5bcd 100755
--- a/scripts/combine_journal_lists_dots.py
+++ b/scripts/combine_journal_lists_dots.py
@@ -3,7 +3,7 @@
 """
 Python script for combining several journal abbreviation lists
 and producing an alphabetically sorted list. If the same journal
-names are repeated, only the version found last is retained.
+names are repeated, only the version found first is retained.
 
 This version of the script specifically combines the lists following the ISO4
 standard WITH dots after abbreviated words.
@@ -13,45 +13,102 @@
 Output: writes file 'journalList_dots.csv' (or specified output file)
 """
 
+import csv
+import json
+from pathlib import Path
+import re
 import sys
-import pandas as pd
 
-# Define the list of CSV files
 import_order = [
-    'journals/journal_abbreviations_acs.csv',
-    'journals/journal_abbreviations_ams.csv',
-    'journals/journal_abbreviations_general.csv',
-    'journals/journal_abbreviations_geology_physics.csv',
-    'journals/journal_abbreviations_ieee.csv',
-    'journals/journal_abbreviations_lifescience.csv',
-    'journals/journal_abbreviations_mathematics.csv',
-    'journals/journal_abbreviations_mechanical.csv',
-    'journals/journal_abbreviations_meteorology.csv',
-    'journals/journal_abbreviations_sociology.csv',
-    'journals/journal_abbreviations_webofscience-dots.csv'
+    # Keep IEEE before ubc, because IEEE has its own style (the first match wins).
+    "journals/journal_abbreviations_ieee.csv",
+    "journals/journal_abbreviations_acs.csv",
+    # Keep ubc before the other JabRef lists, because ubc's data is more accurate.
+    "journals/journal_abbreviations_ubc.csv",
+    "journals/journal_abbreviations_ams.csv",
+    "journals/journal_abbreviations_general.csv",
+    "journals/journal_abbreviations_geology_physics.csv",
+    "journals/journal_abbreviations_lifescience.csv",
+    "journals/journal_abbreviations_mathematics.csv",
+    "journals/journal_abbreviations_mechanical.csv",
+    "journals/journal_abbreviations_meteorology.csv",
+    "journals/journal_abbreviations_sociology.csv",
+    "journals/journal_abbreviations_webofscience-dots.csv",
 ]
 
 
-def main(output_filename):
-    # Read and merge CSV files
-    # dfs = [pd.read_csv(file, header=None) for file in import_order]
-    dfs = []
-    for file in import_order:
-        df = pd.read_csv(file, header=None)
-        dfs.append(df)
-        print(f"{file}: {len(df)}")
-    merged_df = pd.concat(dfs, ignore_index=True)
+def load_data(file_paths):
+    """Merge CSV rows into {name: abbr}, dropping unusable rows; the first occurrence wins."""
+    journal_dict = {}
+    normalized_keys = set()
+    for path in file_paths:
+        with open(path, newline="", encoding="utf-8") as file:
+            reader = csv.reader(file)
+            for row in reader:
+                name = row[0].strip() if row else ""
+                abbr = row[1].strip() if len(row) > 1 else ""
 
-    # Drop duplicates based on the first column value and keep the last one obtained
-    merged_df.drop_duplicates(subset=[0], keep='last', inplace=True)
+                # Discard blank/short rows and entries where name or abbr is missing
+                if not (name and abbr):
+                    continue
+                # Discard entries that are too long or too short
+                if len(name) >= 80 or len(name) <= 3:
+                    continue
+                # Discard names that start with non-alphanumeric characters
+                if not name[0].isalnum():
+                    continue
+                # Discard names that consist only of numbers
+                if name.replace(" ", "").isnumeric():
+                    continue
+                # Discard names containing a backslash
+                if "\\" in name:
+                    continue
+                # Discard entries whose first letters differ (a leading "The"/"A" in name is ignored)
+                if abbr[0] != re.sub(r"^(?:The|A)\s+", "", name)[0]:
+                    continue
+                # Only keep the first occurrence
+                if name in journal_dict:
+                    continue
+                # Generate normalizedKey, keeping only the first match
+                normalized_key = normalize_name(name)
+                if normalized_key in normalized_keys:
+                    continue
 
-    # Sort alphabetically
-    sorted_df = merged_df.sort_values(by=[0])
+                journal_dict[name] = abbr
+                normalized_keys.add(normalized_key)  # Add to the set of used keys
+    return journal_dict
 
-    # Save the result to the specified CSV file and ensure values are quoted
-    sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)
-    print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
 
+def normalize_name(name):
+    """
+    Return the de-duplication key: drop the words the/and and the characters "&-:, ()", then lowercase.
+    See src/utils/str.ts -> normalizeKey()
+    """
+    return re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE).lower()
+
+
+def save_to_json(data, output_file):
+    """Save the data to a JSON file (not called by main() in this script)."""
+    with open(output_file, mode="w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=2, ensure_ascii=False)
+
+
+def save_to_csv(data, output_file):
+    """Save the {name: abbr} mapping to a CSV file, one fully quoted row per entry."""
+    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
+        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
+        for name, abbr in data.items():
+            writer.writerow([name, abbr])
+
+
+def main(filename):
+    base_path = Path.cwd()
+    output_filename = base_path / filename
+    import_paths = [base_path / file for file in import_order]
+    # Input and output paths are resolved against the current working directory.
+    journal_data = load_data(import_paths)
+    sorted_journal_data = dict(sorted(journal_data.items()))  # Sort alphabetically
+    save_to_csv(sorted_journal_data, output_filename)
 
 
 if __name__ == "__main__":