From 2ffcf848122d36991c7d268b1d5d35ce5e53e456 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 11 Jun 2026 18:30:11 -0700 Subject: [PATCH 1/2] #686 #696 Remove dir options and improve docs --- complexipy-snapshot.json | 388 ---------- .../{load => cli/contracts}/__init__.py | 0 src/scribe_data/cli/contracts/check.py | 81 +-- src/scribe_data/cli/contracts/export.py | 23 +- src/scribe_data/cli/contracts/filter.py | 55 +- src/scribe_data/cli/convert.py | 473 ------------- src/scribe_data/cli/convert/__init__.py | 0 src/scribe_data/cli/convert/to_csv_or_tsv.py | 199 ++++++ src/scribe_data/cli/convert/to_json.py | 165 +++++ .../convert/to_sqlite.py} | 45 +- src/scribe_data/cli/convert/wrapper.py | 102 +++ src/scribe_data/cli/download/__init__.py | 0 .../wikidata_lexeme_dump.py} | 144 +--- .../cli/download/wiktionary_dump.py | 124 ++++ src/scribe_data/cli/get.py | 26 +- src/scribe_data/cli/interactive.py | 663 ------------------ src/scribe_data/cli/interactive/__init__.py | 0 src/scribe_data/cli/interactive/config.py | 43 ++ src/scribe_data/cli/interactive/execute.py | 181 +++++ src/scribe_data/cli/interactive/prompt.py | 191 +++++ src/scribe_data/cli/interactive/run.py | 299 ++++++++ src/scribe_data/cli/list.py | 205 ------ src/scribe_data/cli/list/__init__.py | 0 src/scribe_data/cli/list/data_types.py | 82 +++ src/scribe_data/cli/list/languages.py | 60 ++ src/scribe_data/cli/list/wrapper.py | 67 ++ src/scribe_data/cli/main.py | 281 +++----- src/scribe_data/cli/total.py | 450 ------------ src/scribe_data/cli/total/__init__.py | 0 src/scribe_data/cli/total/print_values.py | 185 +++++ src/scribe_data/cli/total/query.py | 176 +++++ src/scribe_data/cli/total/wrapper.py | 110 +++ src/scribe_data/cli/upgrade.py | 2 + src/scribe_data/cli/version.py | 8 + src/scribe_data/wikidata/parse_dump.py | 8 +- src/scribe_data/wikidata/wikidata_utils.py | 9 +- .../wiktionary/parse_translations.py | 6 +- tests/check/test_check_project_metadata.py | 6 +- ...s_check.py => test_cli_contracts_check.py} | 25 +- ...export.py => test_cli_contracts_export.py} | 71 +- ...export.py => test_cli_contracts_filter.py} | 74 +- .../convert/test_cli_convert_to_csv_or_tsv.py | 156 +++++ tests/cli/convert/test_cli_convert_to_json.py | 171 +++++ .../convert/test_cli_convert_to_sqlite.py} | 111 ++- tests/cli/convert/test_cli_convert_wrapper.py | 143 ++++ ...test_cli_download_wikidata_lexeme_dump.py} | 60 +- .../test_cli_interactive_config.py | 92 +++ .../test_cli_interactive_execute.py | 66 ++ .../test_cli_interactive_prompt.py | 108 +++ .../interactive/test_cli_interactive_run.py | 56 ++ tests/cli/list/test_cli_list_data_types.py | 73 ++ tests/cli/list/test_cli_list_languages.py | 93 +++ tests/cli/list/test_cli_list_wrapper.py | 70 ++ tests/cli/{test_get.py => test_cli_get.py} | 167 ++--- .../{test_upgrade.py => test_cli_upgrade.py} | 28 +- .../cli/{test_utils.py => test_cli_utils.py} | 188 ++++- .../{test_version.py => test_cli_version.py} | 16 +- tests/cli/test_convert.py | 500 ------------- tests/cli/test_interactive.py | 260 ------- tests/cli/test_list.py | 211 ------ tests/cli/test_total.py | 610 ---------------- tests/cli/total/test_cli_total_query.py | 249 +++++++ tests/cli/total/test_cli_total_wrapper.py | 384 ++++++++++ tests/load/test_update_utils.py | 177 ----- ...metadata.py => test_resources_metadata.py} | 10 +- ...> test_unicode_generate_emoji_keywords.py} | 8 +- ..._query.py => test_wikidata_check_query.py} | 152 ++-- .../test_wikidata_dump.py} | 14 +- ...ry_data.py => test_wikidata_query_data.py} | 8 +- ... => test_wiktionary_parse_translations.py} | 0 70 files changed, 4389 insertions(+), 4819 deletions(-) delete mode 100644 complexipy-snapshot.json rename src/scribe_data/{load => cli/contracts}/__init__.py (100%) delete mode 100644 src/scribe_data/cli/convert.py create mode 100644 src/scribe_data/cli/convert/__init__.py create mode 100644 src/scribe_data/cli/convert/to_csv_or_tsv.py create mode 100644 src/scribe_data/cli/convert/to_json.py rename src/scribe_data/{load/data_to_sqlite.py => cli/convert/to_sqlite.py} (95%) create mode 100644 src/scribe_data/cli/convert/wrapper.py create mode 100644 src/scribe_data/cli/download/__init__.py rename src/scribe_data/cli/{download.py => download/wikidata_lexeme_dump.py} (63%) create mode 100644 src/scribe_data/cli/download/wiktionary_dump.py delete mode 100644 src/scribe_data/cli/interactive.py create mode 100644 src/scribe_data/cli/interactive/__init__.py create mode 100644 src/scribe_data/cli/interactive/config.py create mode 100644 src/scribe_data/cli/interactive/execute.py create mode 100644 src/scribe_data/cli/interactive/prompt.py create mode 100644 src/scribe_data/cli/interactive/run.py delete mode 100644 src/scribe_data/cli/list.py create mode 100644 src/scribe_data/cli/list/__init__.py create mode 100644 src/scribe_data/cli/list/data_types.py create mode 100644 src/scribe_data/cli/list/languages.py create mode 100644 src/scribe_data/cli/list/wrapper.py delete mode 100644 src/scribe_data/cli/total.py create mode 100644 src/scribe_data/cli/total/__init__.py create mode 100644 src/scribe_data/cli/total/print_values.py create mode 100644 src/scribe_data/cli/total/query.py create mode 100644 src/scribe_data/cli/total/wrapper.py rename tests/cli/contracts/{test_contracts_check.py => test_cli_contracts_check.py} (88%) rename tests/cli/contracts/{test_export.py => test_cli_contracts_export.py} (61%) rename tests/cli/contracts/{test_contracts_export.py => test_cli_contracts_filter.py} (87%) create mode 100644 tests/cli/convert/test_cli_convert_to_csv_or_tsv.py create mode 100644 tests/cli/convert/test_cli_convert_to_json.py rename tests/{load/test_data_to_sqlite.py => cli/convert/test_cli_convert_to_sqlite.py} (84%) create mode 100644 tests/cli/convert/test_cli_convert_wrapper.py rename tests/cli/{test_download.py => download/test_cli_download_wikidata_lexeme_dump.py} (86%) create mode 100644 tests/cli/interactive/test_cli_interactive_config.py create mode 100644 tests/cli/interactive/test_cli_interactive_execute.py create mode 100644 tests/cli/interactive/test_cli_interactive_prompt.py create mode 100644 tests/cli/interactive/test_cli_interactive_run.py create mode 100644 tests/cli/list/test_cli_list_data_types.py create mode 100644 tests/cli/list/test_cli_list_languages.py create mode 100644 tests/cli/list/test_cli_list_wrapper.py rename tests/cli/{test_get.py => test_cli_get.py} (79%) rename tests/cli/{test_upgrade.py => test_cli_upgrade.py} (94%) rename tests/cli/{test_utils.py => test_cli_utils.py} (65%) rename tests/cli/{test_version.py => test_cli_version.py} (87%) delete mode 100644 tests/cli/test_convert.py delete mode 100644 tests/cli/test_interactive.py delete mode 100644 tests/cli/test_list.py delete mode 100644 tests/cli/test_total.py create mode 100644 tests/cli/total/test_cli_total_query.py create mode 100644 tests/cli/total/test_cli_total_wrapper.py delete mode 100644 tests/load/test_update_utils.py rename tests/resources/{test_metadata.py => test_resources_metadata.py} (84%) rename tests/unicode/{test_generate_emoji_keywords.py => test_unicode_generate_emoji_keywords.py} (94%) rename tests/wikidata/{test_check_query.py => test_wikidata_check_query.py} (83%) rename tests/{cli/test_dump.py => wikidata/test_wikidata_dump.py} (93%) rename tests/wikidata/{test_query_data.py => test_wikidata_query_data.py} (98%) rename tests/wiktionary/{test_parse_translations.py => test_wiktionary_parse_translations.py} (100%) diff --git a/complexipy-snapshot.json b/complexipy-snapshot.json deleted file mode 100644 index fdd4cd3ac..000000000 --- a/complexipy-snapshot.json +++ /dev/null @@ -1,388 +0,0 @@ -[ - { - "path": "scribe_data/check/check_missing_forms/check_missing_forms.py", - "file_name": "check_missing_forms.py", - "functions": [ - { - "name": "get_features_from_sparql_service", - "complexity": 18 - }, - { - "name": "get_forms_from_sparql_service", - "complexity": 23 - }, - { - "name": "execute_sparql_query", - "complexity": 24 - }, - { - "name": "process_missing_features", - "complexity": 27 - }, - { - "name": "get_forms_from_sparql_service_all_languages", - "complexity": 38 - } - ] - }, - { - "path": "scribe_data/check/check_missing_forms/generate_query.py", - "file_name": "generate_query.py", - "functions": [ - { - "name": "generate_query", - "complexity": 43 - } - ] - }, - { - "path": "scribe_data/check/check_missing_forms/pr_body.py", - "file_name": "pr_body.py", - "functions": [ - { - "name": "pr_body", - "complexity": 27 - } - ] - }, - { - "path": "scribe_data/check/check_missing_forms/split_query.py", - "file_name": "split_query.py", - "functions": [ - { - "name": "split_group_by_identifier", - "complexity": 60 - } - ] - }, - { - "path": "scribe_data/check/check_project_metadata.py", - "file_name": "check_project_metadata.py", - "functions": [ - { - "name": "check_language_metadata", - "complexity": 19 - }, - { - "name": "get_available_languages", - "complexity": 19 - }, - { - "name": "validate_language_properties", - "complexity": 21 - }, - { - "name": "get_missing_languages", - "complexity": 24 - } - ] - }, - { - "path": "scribe_data/check/check_project_structure.py", - "file_name": "check_project_structure.py", - "functions": [ - { - "name": "check_data_type_folders", - "complexity": 18 - }, - { - "name": "check_project_structure", - "complexity": 40 - } - ] - }, - { - "path": "scribe_data/check/check_query_forms.py", - "file_name": "check_query_forms.py", - "functions": [ - { - "name": "check_optional_qid_order", - "complexity": 31 - }, - { - "name": "check_query_forms", - "complexity": 39 - } - ] - }, - { - "path": "scribe_data/check/check_query_identifiers.py", - "file_name": "check_query_identifiers.py", - "functions": [ - { - "name": "check_query_identifiers", - "complexity": 16 - } - ] - }, - { - "path": "scribe_data/cli/cli_utils.py", - "file_name": "cli_utils.py", - "functions": [ - { - "name": "validate_language_and_data_type", - "complexity": 36 - }, - { - "name": "print_formatted_data", - "complexity": 43 - } - ] - }, - { - "path": "scribe_data/cli/contracts/check.py", - "file_name": "check.py", - "functions": [ - { - "name": "check_contract_data_completeness", - "complexity": 40 - } - ] - }, - { - "path": "scribe_data/cli/contracts/filter.py", - "file_name": "filter.py", - "functions": [ - { - "name": "filter_exported_data", - "complexity": 17 - }, - { - "name": "export_data_filtered_by_contracts", - "complexity": 20 - }, - { - "name": "filter_contract_metadata", - "complexity": 81 - } - ] - }, - { - "path": "scribe_data/cli/convert.py", - "file_name": "convert.py", - "functions": [ - { - "name": "convert_to_json", - "complexity": 90 - }, - { - "name": "convert_to_csv_or_tsv", - "complexity": 123 - } - ] - }, - { - "path": "scribe_data/cli/download.py", - "file_name": "download.py", - "functions": [ - { - "name": "download_wd_lexeme_dump", - "complexity": 26 - }, - { - "name": "wd_lexeme_dump_download_wrapper", - "complexity": 28 - }, - { - "name": "available_closest_lexeme_dumpfile", - "complexity": 29 - }, - { - "name": "download_wiktionary_dumps", - "complexity": 31 - } - ] - }, - { - "path": "scribe_data/cli/get.py", - "file_name": "get.py", - "functions": [ - { - "name": "get_data", - "complexity": 48 - } - ] - }, - { - "path": "scribe_data/cli/interactive.py", - "file_name": "interactive.py", - "functions": [ - { - "name": "start_interactive_mode", - "complexity": 36 - } - ] - }, - { - "path": "scribe_data/cli/main.py", - "file_name": "main.py", - "functions": [ - { - "name": "main", - "complexity": 104 - } - ] - }, - { - "path": "scribe_data/cli/total.py", - "file_name": "total.py", - "functions": [ - { - "name": "get_datatype_list", - "complexity": 21 - }, - { - "name": "get_total_lexemes", - "complexity": 29 - }, - { - "name": "total_wrapper", - "complexity": 30 - }, - { - "name": "print_total_lexemes", - "complexity": 35 - } - ] - }, - { - "path": "scribe_data/load/data_to_sqlite.py", - "file_name": "data_to_sqlite.py", - "functions": [ - { - "name": "wiktionary_translations_to_sqlite", - "complexity": 17 - }, - { - "name": "data_to_sqlite", - "complexity": 75 - } - ] - }, - { - "path": "scribe_data/unicode/process_unicode.py", - "file_name": "process_unicode.py", - "functions": [ - { - "name": "gen_emoji_lexicon", - "complexity": 45 - } - ] - }, - { - "path": "scribe_data/utils.py", - "file_name": "utils.py", - "functions": [ - { - "name": "_find", - "complexity": 22 - }, - { - "name": "check_lexeme_dump_prompt_download", - "complexity": 35 - } - ] - }, - { - "path": "scribe_data/wikidata/format_data.py", - "file_name": "format_data.py", - "functions": [ - { - "name": "format_data", - "complexity": 32 - } - ] - }, - { - "path": "scribe_data/wikidata/parse_dump.py", - "file_name": "parse_dump.py", - "functions": [ - { - "name": "LexemeProcessor::_build_iso_mapping", - "complexity": 27 - }, - { - "name": "LexemeProcessor::process_lines", - "complexity": 27 - }, - { - "name": "LexemeProcessor::process_file", - "complexity": 34 - }, - { - "name": "LexemeProcessor::export_forms_json", - "complexity": 52 - }, - { - "name": "parse_dump", - "complexity": 78 - }, - { - "name": "LexemeProcessor::_process_forms", - "complexity": 101 - } - ] - }, - { - "path": "scribe_data/wikidata/query_data.py", - "file_name": "query_data.py", - "functions": [ - { - "name": "query_data", - "complexity": 73 - } - ] - }, - { - "path": "scribe_data/wikidata/wikidata_utils.py", - "file_name": "wikidata_utils.py", - "functions": [ - { - "name": "parse_wd_lexeme_dump", - "complexity": 23 - } - ] - }, - { - "path": "scribe_data/wiktionary/parse_translations.py", - "file_name": "parse_translations.py", - "functions": [ - { - "name": "_merge_parsed_into_output", - "complexity": 16 - }, - { - "name": "_resolve_dump_path", - "complexity": 19 - }, - { - "name": "_collect_row_wikilink", - "complexity": 24 - }, - { - "name": "_extract_translation_word", - "complexity": 27 - }, - { - "name": "parse_wiktionary_translations", - "complexity": 30 - }, - { - "name": "_iter_dump_pages", - "complexity": 47 - }, - { - "name": "_parse_ast_u_tabelle", - "complexity": 47 - }, - { - "name": "_parse_block_translations", - "complexity": 50 - }, - { - "name": "parse_xml_dump", - "complexity": 50 - } - ] - } -] diff --git a/src/scribe_data/load/__init__.py b/src/scribe_data/cli/contracts/__init__.py similarity index 100% rename from src/scribe_data/load/__init__.py rename to src/scribe_data/cli/contracts/__init__.py diff --git a/src/scribe_data/cli/contracts/check.py b/src/scribe_data/cli/contracts/check.py index 62c7f1b05..89171221a 100644 --- a/src/scribe_data/cli/contracts/check.py +++ b/src/scribe_data/cli/contracts/check.py @@ -21,30 +21,11 @@ data_contracts_langs[i] = get_language_from_iso(data_contracts_langs[i]) -def check_contracts(output_dir: str | None = None) -> None: - """ - Check data contracts in the specified or default output directory to ensure data completeness. - - Parameters - ---------- - output_dir : Optional[str], optional - Directory containing exported contract data. - If None, uses the default DEFAULT_JSON_EXPORT_DIR. - """ - export_dir = Path(output_dir or DEFAULT_JSON_EXPORT_DIR) - - if not export_dir.exists(): - print( - f"Error: Directory {export_dir} does not exist.\nPlease use export JSON first." - ) - return - - missing_forms = check_contract_data_completeness(export_dir) - print_missing_forms(missing_forms) +# MARK: Check Contracts def check_contract_data_completeness( - export_dir: Path, language: str | None = None + contracts_dir: Path, ) -> dict[str, dict[str, list[str]]]: """ Validate exported data contracts against their metadata requirements. @@ -54,11 +35,9 @@ def check_contract_data_completeness( Parameters ---------- - export_dir : Path - Directory containing exported contract data. - - language : Optional[str], optional - Specific language to check. If None, checks all languages in the directory. + contracts_dir : Path + Directory containing the contracts to filter with. + Defaults to DEFAULT_DATA_CONTRACTS_DIR. Returns ------- @@ -75,25 +54,7 @@ def check_contract_data_completeness( The above is the expected structure. """ # Determine languages to check. - if language: - languages_to_check = [language] - - elif export_dir.exists(): - unique_dirs = {} - for item in export_dir.iterdir(): - if item.is_dir(): - lower_name = item.name.lower() - # Prioritize strictly lowercase directory names to avoid checking capitalized duplicates. - if lower_name not in unique_dirs or item.name == lower_name: - unique_dirs[lower_name] = item.name - - languages_to_check = list(unique_dirs.values()) - - else: - languages_to_check = [ - Path(f).stem.lower() for f in DEFAULT_DATA_CONTRACTS_DIR.glob("*.yaml") - ] - + languages_to_check = [Path(f).stem.lower() for f in contracts_dir.glob("*.yaml")] languages_to_check = [ lang for lang in languages_to_check @@ -107,7 +68,7 @@ def check_contract_data_completeness( # Get ISO code and contract file. try: iso_code = get_language_iso(lang.lower()) - contract_file = DEFAULT_DATA_CONTRACTS_DIR / f"{iso_code.lower()}.yaml" + contract_file = contracts_dir / f"{iso_code.lower()}.yaml" if not contract_file.exists(): print(f"Warning: No contract file found for {lang}") @@ -119,7 +80,7 @@ def check_contract_data_completeness( # Get contract metadata. contract_metadata = filter_contract_metadata(contract_file) - export_lang_dir = export_dir / lang_dir_name + export_lang_dir = DEFAULT_JSON_EXPORT_DIR / lang_dir_name # Check missing forms for nouns and verbs. lang_missing_forms = {} @@ -166,6 +127,9 @@ def check_contract_data_completeness( return missing_forms +# MARK: Print Missing + + def print_missing_forms(missing_forms: dict[str, dict[str, list[str]]]) -> None: """ Print missing forms from data contracts. @@ -187,3 +151,26 @@ def print_missing_forms(missing_forms: dict[str, dict[str, list[str]]]) -> None: print(f" {data_type.capitalize()}:") for form in forms: print(f" - {form}") + + +def check_contract_data_print_missing(contracts_dir: Path) -> None: + """ + Check data contracts in the specified or default output directory to ensure data completeness. + + Parameters + ---------- + contracts_dir : Path + Directory containing the contracts to filter with. + Defaults to DEFAULT_DATA_CONTRACTS_DIR. + """ + + contracts_dir = Path(contracts_dir) if contracts_dir else DEFAULT_DATA_CONTRACTS_DIR + + if not DEFAULT_JSON_EXPORT_DIR.exists(): + print( + f"Error: Directory {DEFAULT_JSON_EXPORT_DIR} does not exist.\nPlease use export JSON first." + ) + return + + missing_forms = check_contract_data_completeness(contracts_dir=contracts_dir) + print_missing_forms(missing_forms) diff --git a/src/scribe_data/cli/contracts/export.py b/src/scribe_data/cli/contracts/export.py index 53de6c443..d3cee1a2c 100644 --- a/src/scribe_data/cli/contracts/export.py +++ b/src/scribe_data/cli/contracts/export.py @@ -8,20 +8,17 @@ from scribe_data.utils import DEFAULT_CONTRACTS_EXPORT_DIR +# MARK: Export Contracts -def export_contracts(output_dir: Path = DEFAULT_CONTRACTS_EXPORT_DIR) -> None: - """ - Export Scribe-Data contracts to the given directory. - Parameters - ---------- - output_dir : str, default=DEFAULT_CONTRACTS_EXPORT_DIR - The directory to export contracts to. +def export_contracts() -> None: + """ + Export Scribe-Data contracts to the default contract export directory. Returns ------- None - Contracts are exported to the given directory. + Contracts are exported to the default contract export directory. """ contracts_source = ( Path(__file__).parent.parent.parent / "resources" / "data_contracts" @@ -31,10 +28,10 @@ def export_contracts(output_dir: Path = DEFAULT_CONTRACTS_EXPORT_DIR) -> None: f"Contracts source directory not found at {contracts_source}." ) - if output_dir.exists(): + if DEFAULT_CONTRACTS_EXPORT_DIR.exists(): response = ( input( - f"A '{output_dir}' folder already exists with the Scribe-Data contracts. " + f"A '{DEFAULT_CONTRACTS_EXPORT_DIR}' folder already exists with the Scribe-Data contracts. " "Do you want to overwrite it? (y/[n]): " ) .strip() @@ -45,7 +42,7 @@ def export_contracts(output_dir: Path = DEFAULT_CONTRACTS_EXPORT_DIR) -> None: print("Export cancelled.") return - shutil.rmtree(output_dir) + shutil.rmtree(DEFAULT_CONTRACTS_EXPORT_DIR) - shutil.copytree(contracts_source, output_dir) - print(f"Contracts successfully exported to {output_dir}.") + shutil.copytree(contracts_source, DEFAULT_CONTRACTS_EXPORT_DIR) + print(f"Contracts successfully exported to {DEFAULT_CONTRACTS_EXPORT_DIR}.") diff --git a/src/scribe_data/cli/contracts/filter.py b/src/scribe_data/cli/contracts/filter.py index 25899c027..853ee28a8 100644 --- a/src/scribe_data/cli/contracts/filter.py +++ b/src/scribe_data/cli/contracts/filter.py @@ -18,6 +18,8 @@ get_language_from_iso, ) +# MARK: Filter Metadata + def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: """ @@ -44,7 +46,6 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: "verbs": {"conjugations": []}, } - # Filter Numbers if "numbers" in contract_data: numbers = contract_data["numbers"] # Handle different possible structures of numbers. @@ -53,7 +54,7 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: # Case 1: Simple key-value pair like {"singular": "plural"}. if isinstance(numbers, dict): for key, value in numbers.items(): - # Ignore empty strings + # Ignore empty strings. if key: filtered_numbers.append(key) @@ -63,7 +64,6 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: # Case 2: List of number types. elif isinstance(numbers, list): - # Filter out empty strings filtered_numbers = [n for n in numbers if n] # Case 3: String of number types. @@ -71,7 +71,7 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: # Split and filter out empty strings. filtered_numbers = [n for n in numbers.split() if n] - # Remove duplicates and store + # Remove duplicates and store. filtered_metadata["nouns"]["numbers"] = list(set(filtered_numbers)) # Filter Genders. @@ -92,10 +92,8 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: # Filter Conjugations. if "conjugations" in contract_data: - conjugations = contract_data["conjugations"] - - # Collect all conjugation forms. conj_forms = set() + conjugations = contract_data["conjugations"] # Handle nested conjugation structure. if isinstance(conjugations, dict): @@ -112,6 +110,7 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: ).split() ] conj_forms.update(cleaned_forms) + elif isinstance(form, list): cleaned_forms = [ f @@ -123,7 +122,6 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: ] conj_forms.update(cleaned_forms) - # If conjugations is a string, split it. elif isinstance(conjugations, str): # Remove square brackets and split using regex. cleaned_forms = [ @@ -131,7 +129,6 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: ] conj_forms.update(cleaned_forms) - # If conjugations is a list, use it directly. elif isinstance(conjugations, list): # Remove square bracketed items. cleaned_forms = [ @@ -153,6 +150,9 @@ def filter_contract_metadata(contract_file: Path) -> dict[str, Any]: return {} +# MARK: Filter Export Data + + def filter_exported_data( input_file: Path, contract_metadata: dict[str, Any], data_type: str ) -> dict[str, Any]: @@ -221,9 +221,10 @@ def filter_exported_data( return {} -def export_data_filtered_by_contracts( - contracts_dir: Path, input_dir: Path, output_dir: Path -) -> None: +# MARK: Export Filtered Data + + +def export_data_filtered_by_contracts(contracts_dir: Path) -> None: """ Export contract-filtered data to a new directory with a standardized structure. @@ -236,27 +237,16 @@ def export_data_filtered_by_contracts( Directory containing the contracts to filter with. Defaults to DEFAULT_DATA_CONTRACTS_DIR. - input_dir : Path - Directory containing original JSON export data. - Defaults to DEFAULT_JSON_EXPORT_DIR. - - output_dir : Path - Directory to export filtered contract data. - Defaults to scribe_data_filtered_* based on the data type. - Returns ------- None Prints information on the data that has been filtered. """ - # Use provided output dir or default. - export_dir = Path(output_dir) if output_dir else DEFAULT_FILTERED_JSON_EXPORT_DIR - export_dir.mkdir(parents=True, exist_ok=True) - - input_dir = input_dir or DEFAULT_JSON_EXPORT_DIR - contracts_dir = Path(contracts_dir) if contracts_dir else DEFAULT_DATA_CONTRACTS_DIR + # Use provided output dir or default. + DEFAULT_FILTERED_JSON_EXPORT_DIR.mkdir(parents=True, exist_ok=True) + for contract_filename in os.listdir(contracts_dir): if not contract_filename.endswith(".yaml"): continue @@ -277,10 +267,15 @@ def export_data_filtered_by_contracts( continue # Create language directory in export path. - lang_export_dir = export_dir / matched_language.lower().replace(" ", "_") + lang_export_dir = ( + DEFAULT_FILTERED_JSON_EXPORT_DIR + / matched_language.lower().replace(" ", "_") + ) lang_export_dir.mkdir(parents=True, exist_ok=True) - lang_input_dir = Path(input_dir) / matched_language.lower().replace(" ", "_") + lang_input_dir = Path( + DEFAULT_JSON_EXPORT_DIR + ) / matched_language.lower().replace(" ", "_") if not lang_input_dir.exists(): print(f"No input directory found for {matched_language}") continue @@ -294,7 +289,7 @@ def export_data_filtered_by_contracts( # Skip unsupported types if needed. if data_type not in contract_metadata: output_file = ( - export_dir + DEFAULT_FILTERED_JSON_EXPORT_DIR / matched_language.lower().replace(" ", "_") / f"{data_type}.json" ) @@ -313,7 +308,7 @@ def export_data_filtered_by_contracts( input_file, contract_metadata, data_type ): output_file = ( - export_dir + DEFAULT_FILTERED_JSON_EXPORT_DIR / matched_language.lower().replace(" ", "_") / f"{data_type}.json" ) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py deleted file mode 100644 index a5534f3fc..000000000 --- a/src/scribe_data/cli/convert.py +++ /dev/null @@ -1,473 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Functions to convert data returned from the Scribe-Data CLI to other file types. -""" - -import csv -import json -from pathlib import Path - -from scribe_data.load.data_to_sqlite import data_to_sqlite -from scribe_data.utils import ( - DEFAULT_CSV_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_SQLITE_EXPORT_DIR, - DEFAULT_TSV_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, - camel_to_snake, - check_index_exists, -) - -# MARK: JSON - - -def convert_to_json( - language: str, - data_types: str | list[str] | None, - input_file: Path, - output_dir: Path, - output_type: str, - overwrite: bool = False, - identifier_case: str = "camel", -) -> None: - """ - Convert a CSV/TSV file to JSON. - - Parameters - ---------- - language : str - The language of the file to convert. - - data_types : Union[str, List[str]] - The data type of the file to convert. - - input_file : Path - The input CSV/TSV file path. - - output_dir : Path - The output directory path for results. - - output_type : str - The output format, should be "json". - - overwrite : bool - Whether to overwrite existing files. - - identifier_case : str - The case format for identifiers. Default is "camel". - - Returns - ------- - None - A JSON file. - """ - if not language: - raise ValueError(f"Language '{language.capitalize()}' is not recognized.") - - data_types = [data_types] if isinstance(data_types, str) else data_types - - if not data_types: - return - - if output_dir is None: - output_dir = DEFAULT_JSON_EXPORT_DIR - - json_output_dir = Path(output_dir) / language.capitalize() - json_output_dir.mkdir(parents=True, exist_ok=True) - - for dtype in data_types: - if not input_file.exists(): - print(f"No data found for {dtype} conversion at '{input_file}'.") - continue - - delimiter = {".csv": ",", ".tsv": "\t"}.get(input_file.suffix.lower()) - - if not delimiter: - raise ValueError( - f"Unsupported file extension '{input_file.suffix}' for {str(input_file)}. Please provide a '.csv' or '.tsv' file." - ) - - try: - with input_file.open("r", encoding="utf-8") as file: - reader = csv.DictReader(file, delimiter=delimiter) - rows = list(reader) - - if not rows: - print(f"No data found in '{input_file}'.") - continue - - # Use the first row to inspect column headers. - first_row = rows[0] - keys = list(first_row.keys()) - data: dict = {} - - if len(keys) == 1: - # Handle Case: { key: None }. - for row in rows: - data[row[keys[0]]] = None - - elif len(keys) == 2: - # Handle Case: { key: value }. - for row in rows: - key = ( - camel_to_snake(row[keys[0]]) - if identifier_case == "snake" - else row[keys[0]] - ) - value = row[keys[1]] - data[key] = value - - elif len(keys) > 2: - if all(col in first_row for col in ["emoji", "is_base", "rank"]): - # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] }. - for row in rows: - if reader.fieldnames and len(reader.fieldnames) > 0: - if identifier_case == "snake": - raw_value = row.get(reader.fieldnames[0]) - key = camel_to_snake(raw_value or "") - - else: - key = row.get(reader.fieldnames[0]) - - emoji = row.get("emoji", "").strip() - is_base = ( - row.get("is_base", "false").strip().lower() == "true" - ) - rank = row.get("rank", None) - rank = int(rank) if rank and rank.isdigit() else None - - entry = {"emoji": emoji, "is_base": is_base, "rank": rank} - - if key is None: - continue - - data.setdefault(key, []).append(entry) - - else: - # Handle Case: { key: { value1: ..., value2: ... } }. - for row in rows: - data[row[keys[0]]] = { - ( - camel_to_snake(k) - if identifier_case == "snake" - else k - ): row[k] - for k in keys[1:] - } - - except (IOError, csv.Error) as e: - print(f"Error reading '{input_file}': {e}") - continue - - # Define output file path - output_file = json_output_dir / f"{dtype}.{output_type}" - - if check_index_exists(output_file, overwrite): - print(f"Skipping {dtype}") - continue - - try: - with output_file.open("w", encoding="utf-8") as file: - json.dump(data, file, ensure_ascii=False, indent=2) - - except IOError as e: - print(f"Error writing to '{output_file}': {e}") - continue - - print(f"Data for {language.capitalize()} {dtype} written to {output_file}") - - -# MARK: CSV or TSV - - -def convert_to_csv_or_tsv( - language: str, - data_types: str | list[str], - input_file: Path, - output_dir: Path, - output_type: str, - overwrite: bool = False, - identifier_case: str = "camel", -) -> None: - """ - Convert a JSON File to CSV/TSV file. - - Parameters - ---------- - language : str - The language of the file to convert. - - data_types : Union[str, List[str]] - The data type of the file to convert. - - input_file : Path - The input JSON file path. - - output_dir : Path - The output directory path for results. - - output_type : str - The output format, should be "csv" or "tsv". - - overwrite : bool - Whether to overwrite existing files. - - identifier_case : str - The case format for identifiers. Default is "camel". - - Returns - ------- - None - A CSV/TSV files. - """ - if not language: - raise ValueError(f"Language '{language.capitalize()}' is not recognized.") - - data_types = [data_types] if isinstance(data_types, str) else data_types - - # Modify input file path to use the provided input_file or default JSON export path. - input_file_path = ( - input_file - or DEFAULT_JSON_EXPORT_DIR / language.lower() / f"{data_types[0]}.json" - ) - - for dtype in data_types: - if not input_file_path.exists(): - print(f"No data found for {dtype} conversion at '{input_file_path}'.") - continue - - try: - with input_file_path.open("r", encoding="utf-8") as f: - data = json.load(f) - - except (IOError, json.JSONDecodeError) as e: - print(f"Error reading '{input_file_path}': {e}") - continue - - # Determine the delimiter based on output type. - delimiter = "," if output_type == "csv" else "\t" - - if output_dir is None: - output_dir = ( - DEFAULT_CSV_EXPORT_DIR - if output_type == "csv" - else DEFAULT_TSV_EXPORT_DIR - ) - - final_output_dir = output_dir / language.capitalize() - final_output_dir.mkdir(parents=True, exist_ok=True) - - output_file = final_output_dir / f"{dtype}.{output_type}" - - if check_index_exists(output_file, overwrite): - print(f"Skipping {dtype}") - continue - - try: - with output_file.open("w", newline="", encoding="utf-8") as file: - writer = csv.writer(file, delimiter=delimiter) - - # Handle different JSON structures based on the format. - if isinstance(data, dict): - first_key = list(data.keys())[0] - - first_val = next(iter(data.values())) if data else None - if isinstance(first_val, dict): - # Handle case: { key: { value1: ..., value2: ... } }. - columns = sorted(first_val.keys()) - header = [ - camel_to_snake(dtype[:-1]) - if identifier_case == "snake" - else dtype[:-1] - ] - header += [ - camel_to_snake(col) if identifier_case == "snake" else col - for col in columns - ] - writer.writerow(header) - - for key, value in data.items(): - row = [key] + [value.get(col, "") for col in columns] - writer.writerow(row) - - elif isinstance(data[first_key], list): - if all(isinstance(item, dict) for item in data[first_key]): - # Handle case: { key: [ { value1: ..., value2: ... } ] }. - if "emoji" in data[first_key][0]: # emoji specific case - columns = ["word", "emoji", "is_base", "rank"] - writer.writerow( - [camel_to_snake(col) for col in columns] - if identifier_case == "snake" - else columns - ) - - for key, value in data.items(): - for item in value: - row = [ - key, - item.get("emoji", ""), - item.get("is_base", ""), - item.get("rank", ""), - ] - writer.writerow(row) - - else: - if identifier_case == "snake": - columns = [camel_to_snake(dtype[:-1])] + [ - camel_to_snake(col) - for col in data[first_key][0].keys() - ] - - else: - columns = [dtype[:-1]] + list( - data[first_key][0].keys() - ) - writer.writerow(columns) - - for key, value in data.items(): - for item in value: - row = [key] + [ - item.get(col, "") for col in columns[1:] - ] - writer.writerow(row) - - elif all(isinstance(item, str) for item in data[first_key]): - # Handle case: { key: [value1, value2, ...] }. - header = [ - camel_to_snake(dtype[:-1]) - if identifier_case == "snake" - else dtype[:-1] - ] - header += [ - f"autosuggestion_{i + 1}" - for i in range(len(data[first_key])) - ] - writer.writerow(header) - for key, value in data.items(): - row = [key] + value - writer.writerow(row) - - else: - # Handle case: { key: value }. - writer.writerow( - [ - camel_to_snake(dtype[:-1]) - if identifier_case == "snake" - else dtype[:-1], - "value", - ] - ) - - for key, value in data.items(): - writer.writerow([key, value]) - - except IOError as e: - print(f"Error writing to '{output_file}': {e}") - continue - - print(f"Data for {language.capitalize()} {dtype} written to '{output_file}'") - - -# MARK: Convert Wrapper - - -def convert_wrapper( - languages: list[str] | None, - data_types: list | None, - input_path: Path, - output_dir: Path, - output_type: str, - overwrite: bool = False, - identifier_case: str = "camel", - all: bool = False, -) -> None: - """ - Convert data to the specified output type: JSON, CSV/TSV, or SQLite. - - Parameters - ---------- - languages : Optional[List[str]] - The language(s) of the data to convert. - - data_types : Optional[List[str]] - The data type(s) of the data to convert. - - input_path : Path - The path to the input file or directory. - - output_dir : Path - The output directory where converted files will be stored. - - output_type : str - The desired output format. Can be 'json', 'csv', 'tsv', or 'sqlite'. - - overwrite : bool, optional, default=False - Whether to overwrite existing output files. - - identifier_case : str, optional, default='camel' - The case format for identifiers. - - all : bool, optional, default=False - Convert all languages and data types. - - Returns - ------- - None - This function does not return any value; it performs a conversion operation. - """ - # Route the function call to the correct conversion function. - if output_dir is None: - output_dir = { - "json": DEFAULT_JSON_EXPORT_DIR, - "csv": DEFAULT_CSV_EXPORT_DIR, - "tsv": DEFAULT_TSV_EXPORT_DIR, - "sqlite": DEFAULT_SQLITE_EXPORT_DIR, - }.get(output_type, DEFAULT_JSON_EXPORT_DIR) - - if input_path is None and data_types: - is_wiktionary = any( - isinstance(dt, str) and dt.startswith("wiktionary") - for dt in (data_types if isinstance(data_types, list) else [data_types]) - ) - input_path = ( - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR - if is_wiktionary - else DEFAULT_JSON_EXPORT_DIR - ) - - if output_type == "json" and languages and data_types: - convert_to_json( - language=languages[0], # only one language possible - data_types=data_types, - input_file=input_path, - output_dir=output_dir, - output_type=output_type, - overwrite=overwrite, - identifier_case=identifier_case, - ) - - elif output_type in {"csv", "tsv"} and languages and data_types: - convert_to_csv_or_tsv( - language=languages[0], # only one language possible - data_types=data_types, - input_file=input_path, - output_dir=output_dir, - output_type=output_type, - overwrite=overwrite, - identifier_case=identifier_case, - ) - - elif output_type == "sqlite": - data_to_sqlite( - languages=languages, - specific_tables=data_types, - identifier_case=identifier_case, - input_file=input_path, - output_file=output_dir, - overwrite=overwrite, - ) - - else: - raise ValueError( - f"Unsupported output type '{output_type}'. Must be 'json', 'csv', 'tsv' or 'sqlite'." - ) diff --git a/src/scribe_data/cli/convert/__init__.py b/src/scribe_data/cli/convert/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/cli/convert/to_csv_or_tsv.py b/src/scribe_data/cli/convert/to_csv_or_tsv.py new file mode 100644 index 000000000..b3197129d --- /dev/null +++ b/src/scribe_data/cli/convert/to_csv_or_tsv.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions to convert data returned from the Scribe-Data CLI to CSV or TSV files. +""" + +import csv +import json +from pathlib import Path + +from scribe_data.utils import ( + DEFAULT_CSV_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_TSV_EXPORT_DIR, + camel_to_snake, + check_index_exists, +) + +# MARK: CSV or TSV + + +def convert_to_csv_or_tsv( + language: str, + data_types: str | list[str], + input_file: Path, + output_type: str, + overwrite: bool = False, + identifier_case: str = "camel", +) -> None: + """ + Convert a JSON File to CSV/TSV file. + + Parameters + ---------- + language : str + The language of the file to convert. + + data_types : Union[str, List[str]] + The data type of the file to convert. + + input_file : Path + The input JSON file path. + + output_type : str + The output format, should be "csv" or "tsv". + + overwrite : bool + Whether to overwrite existing files. + + identifier_case : str + The case format for identifiers. Default is "camel". + + Returns + ------- + None + A CSV/TSV files. + """ + if not language: + raise ValueError(f"Language '{language.capitalize()}' is not recognized.") + + data_types = [data_types] if isinstance(data_types, str) else data_types + + # Modify input file path to use the provided input_file or default JSON export path. + input_file_path = ( + input_file + or DEFAULT_JSON_EXPORT_DIR / language.lower() / f"{data_types[0]}.json" + ) + + for dtype in data_types: + if not input_file_path.exists(): + print(f"No data found for {dtype} conversion at '{input_file_path}'.") + continue + + try: + with input_file_path.open("r", encoding="utf-8") as f: + data = json.load(f) + + except (IOError, json.JSONDecodeError) as e: + print(f"Error reading '{input_file_path}': {e}") + continue + + # Determine the delimiter based on output type. + delimiter = "," if output_type == "csv" else "\t" + + output_dir = ( + DEFAULT_CSV_EXPORT_DIR if output_type == "csv" else DEFAULT_TSV_EXPORT_DIR + ) + + final_output_dir = output_dir / language.capitalize() + final_output_dir.mkdir(parents=True, exist_ok=True) + + output_file = final_output_dir / f"{dtype}.{output_type}" + + if check_index_exists(output_file, overwrite): + print(f"Skipping {dtype}") + continue + + try: + with output_file.open("w", newline="", encoding="utf-8") as file: + writer = csv.writer(file, delimiter=delimiter) + + # Handle different JSON structures based on the format. + if isinstance(data, dict): + first_key = list(data.keys())[0] + + first_val = next(iter(data.values())) if data else None + if isinstance(first_val, dict): + # Handle case: { key: { value1: ..., value2: ... } }. + columns = sorted(first_val.keys()) + header = [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1] + ] + header += [ + camel_to_snake(col) if identifier_case == "snake" else col + for col in columns + ] + writer.writerow(header) + + for key, value in data.items(): + row = [key] + [value.get(col, "") for col in columns] + writer.writerow(row) + + elif isinstance(data[first_key], list): + if all(isinstance(item, dict) for item in data[first_key]): + # Handle case: { key: [ { value1: ..., value2: ... } ] }. + if "emoji" in data[first_key][0]: # emoji specific case + columns = ["word", "emoji", "is_base", "rank"] + writer.writerow( + [camel_to_snake(col) for col in columns] + if identifier_case == "snake" + else columns + ) + + for key, value in data.items(): + for item in value: + row = [ + key, + item.get("emoji", ""), + item.get("is_base", ""), + item.get("rank", ""), + ] + writer.writerow(row) + + else: + if identifier_case == "snake": + columns = [camel_to_snake(dtype[:-1])] + [ + camel_to_snake(col) + for col in data[first_key][0].keys() + ] + + else: + columns = [dtype[:-1]] + list( + data[first_key][0].keys() + ) + writer.writerow(columns) + + for key, value in data.items(): + for item in value: + row = [key] + [ + item.get(col, "") for col in columns[1:] + ] + writer.writerow(row) + + elif all(isinstance(item, str) for item in data[first_key]): + # Handle case: { key: [value1, value2, ...] }. + header = [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1] + ] + header += [ + f"autosuggestion_{i + 1}" + for i in range(len(data[first_key])) + ] + writer.writerow(header) + for key, value in data.items(): + row = [key] + value + writer.writerow(row) + + else: + # Handle case: { key: value }. + writer.writerow( + [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1], + "value", + ] + ) + + for key, value in data.items(): + writer.writerow([key, value]) + + except IOError as e: + print(f"Error writing to '{output_file}': {e}") + continue + + print(f"Data for {language.capitalize()} {dtype} written to '{output_file}'") diff --git a/src/scribe_data/cli/convert/to_json.py b/src/scribe_data/cli/convert/to_json.py new file mode 100644 index 000000000..37d052842 --- /dev/null +++ b/src/scribe_data/cli/convert/to_json.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions to convert data returned from the Scribe-Data CLI to JSON files. +""" + +import csv +import json +from pathlib import Path + +from scribe_data.utils import ( + DEFAULT_JSON_EXPORT_DIR, + camel_to_snake, + check_index_exists, +) + +# MARK: JSON + + +def convert_to_json( + language: str, + data_types: str | list[str] | None, + input_file: Path, + output_type: str, + overwrite: bool = False, + identifier_case: str = "camel", +) -> None: + """ + Convert a CSV/TSV file to JSON. + + Parameters + ---------- + language : str + The language of the file to convert. + + data_types : Union[str, List[str]] + The data type of the file to convert. + + input_file : Path + The input CSV/TSV file path. + + output_type : str + The output format, should be "json". + + overwrite : bool + Whether to overwrite existing files. + + identifier_case : str + The case format for identifiers. Default is "camel". + + Returns + ------- + None + A JSON file. + """ + if not language: + raise ValueError(f"Language '{language.capitalize()}' is not recognized.") + + data_types = [data_types] if isinstance(data_types, str) else data_types + + if not data_types: + return + + json_output_dir = Path(DEFAULT_JSON_EXPORT_DIR) / language.capitalize() + json_output_dir.mkdir(parents=True, exist_ok=True) + + for dtype in data_types: + if not input_file.exists(): + print(f"No data found for {dtype} conversion at '{input_file}'.") + continue + + delimiter = {".csv": ",", ".tsv": "\t"}.get(input_file.suffix.lower()) + + if not delimiter: + raise ValueError( + f"Unsupported file extension '{input_file.suffix}' for {str(input_file)}. Please provide a '.csv' or '.tsv' file." + ) + + try: + with input_file.open("r", encoding="utf-8") as file: + reader = csv.DictReader(file, delimiter=delimiter) + rows = list(reader) + + if not rows: + print(f"No data found in '{input_file}'.") + continue + + # Use the first row to inspect column headers. + first_row = rows[0] + keys = list(first_row.keys()) + data: dict = {} + + if len(keys) == 1: + # Handle Case: { key: None }. + for row in rows: + data[row[keys[0]]] = None + + elif len(keys) == 2: + # Handle Case: { key: value }. + for row in rows: + key = ( + camel_to_snake(row[keys[0]]) + if identifier_case == "snake" + else row[keys[0]] + ) + value = row[keys[1]] + data[key] = value + + elif len(keys) > 2: + if all(col in first_row for col in ["emoji", "is_base", "rank"]): + # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] }. + for row in rows: + if reader.fieldnames and len(reader.fieldnames) > 0: + if identifier_case == "snake": + raw_value = row.get(reader.fieldnames[0]) + key = camel_to_snake(raw_value or "") + + else: + key = row.get(reader.fieldnames[0]) + + emoji = row.get("emoji", "").strip() + is_base = ( + row.get("is_base", "false").strip().lower() == "true" + ) + rank = row.get("rank", None) + rank = int(rank) if rank and rank.isdigit() else None + + entry = {"emoji": emoji, "is_base": is_base, "rank": rank} + + if key is None: + continue + + data.setdefault(key, []).append(entry) + + else: + # Handle Case: { key: { value1: ..., value2: ... } }. + for row in rows: + data[row[keys[0]]] = { + ( + camel_to_snake(k) + if identifier_case == "snake" + else k + ): row[k] + for k in keys[1:] + } + + except (IOError, csv.Error) as e: + print(f"Error reading '{input_file}': {e}") + continue + + # Define output file path + output_file = json_output_dir / f"{dtype}.{output_type}" + + if check_index_exists(output_file, overwrite): + print(f"Skipping {dtype}") + continue + + try: + with output_file.open("w", encoding="utf-8") as file: + json.dump(data, file, ensure_ascii=False, indent=2) + + except IOError as e: + print(f"Error writing to '{output_file}': {e}") + continue + + print(f"Data for {language.capitalize()} {dtype} written to {output_file}") diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/cli/convert/to_sqlite.py similarity index 95% rename from src/scribe_data/load/data_to_sqlite.py rename to src/scribe_data/cli/convert/to_sqlite.py index 1cb2c7bd0..903ab0751 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/cli/convert/to_sqlite.py @@ -22,6 +22,8 @@ list_all_languages, ) +# MARK: Operations + def create_table( cursor: sqlite3.Cursor, identifier_case: str, data_type: str, cols: list[str] @@ -33,10 +35,13 @@ def create_table( ---------- cursor : sqlite3.Cursor A sqlite3 cursor. + identifier_case : str Either "camel" or "snake" to determine column naming. + data_type : str The name of the table to be created. + cols : list of str The names of columns for the new table. """ @@ -52,6 +57,7 @@ def create_table( counter += 1 col_name = f"{col_name}_{counter}" col_lower = f"{col_lower}_{counter}" + seen_cols.add(col_lower) processed_cols.append(col_name) @@ -72,8 +78,10 @@ def table_insert(cursor: sqlite3.Cursor, data_type: str, keys: list) -> None: ---------- cursor : sqlite3.Cursor A sqlite3 cursor. + data_type : str The name of the table to be inserted into. + keys : list of any The values to be inserted into the table row. """ @@ -82,12 +90,14 @@ def table_insert(cursor: sqlite3.Cursor, data_type: str, keys: list) -> None: cursor.execute(sql_statement, keys) +# MARK: Wikidata + + def translations_to_sqlite( language_data_type_dict: dict, current_languages: list, identifier_case: str = "snake", input_file: Path = DEFAULT_JSON_EXPORT_DIR, - output_file: Path = DEFAULT_SQLITE_EXPORT_DIR, overwrite: bool = False, ) -> None: """ @@ -97,19 +107,21 @@ def translations_to_sqlite( ---------- language_data_type_dict : dict A dictionary specifying the data types for each language. + current_languages : list A list of current languages. + identifier_case : str, optional The identifier case. Default is "snake". + input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR The input JSON export directory. - output_file : str, optional, default=DEFAULT_SQLITE_EXPORT_DIR - The output SQLite export directory. + overwrite : bool, optional If True, existing SQLite files will be overwritten without prompting. """ maybe_over = "" - translation_db_path = Path(output_file) / "TranslationData.sqlite" + translation_db_path = Path(DEFAULT_SQLITE_EXPORT_DIR) / "TranslationData.sqlite" if translation_db_path.exists(): if not overwrite: answer = questionary.confirm( @@ -170,11 +182,13 @@ def translations_to_sqlite( print("Translations database processing completed.\n") +# MARK: Wiktionary + + def wiktionary_translations_to_sqlite( language, identifier_case="snake", input_file=DEFAULT_JSON_EXPORT_DIR, - output_file=DEFAULT_SQLITE_EXPORT_DIR, overwrite: bool = False, ): """ @@ -197,9 +211,6 @@ def wiktionary_translations_to_sqlite( input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR The input JSON export directory. - output_file : str, optional, default=DEFAULT_SQLITE_EXPORT_DIR - The output SQLite export directory. - overwrite : bool, optional If True, existing SQLite files will be overwritten without prompting. """ @@ -221,7 +232,7 @@ def wiktionary_translations_to_sqlite( if not translation_files: return - db_path = Path(output_file) / "TranslationData.sqlite" + db_path = Path(DEFAULT_SQLITE_EXPORT_DIR) / "TranslationData.sqlite" db_path.parent.mkdir(parents=True, exist_ok=True) connection = sqlite3.connect(db_path) cursor = connection.cursor() @@ -280,12 +291,14 @@ def wiktionary_translations_to_sqlite( print(f"Wiktionary translation tables for {language} processed successfully.\n") -def data_to_sqlite( +# MARK: Convert + + +def convert_to_sqlite( languages: list[str] | None = None, specific_tables: str | list[str] | None = None, identifier_case: str = "camel", input_file: Path = DEFAULT_JSON_EXPORT_DIR, - output_file: Path = DEFAULT_SQLITE_EXPORT_DIR, overwrite: bool = False, ) -> None: """ @@ -305,9 +318,6 @@ def data_to_sqlite( input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR The input JSON export directory. - output_file : str, optional, default=DEFAULT_SQLITE_EXPORT_DIR - The output SQLite export directory. - overwrite : bool, optional If set to True, existing SQLite files will be overwritten without prompting. """ @@ -318,8 +328,7 @@ def data_to_sqlite( ) # Ensure the SQLite export directory exists before creating the database. - sqlite_export_dir = Path(output_file) - sqlite_export_dir.mkdir(parents=True, exist_ok=True) + DEFAULT_SQLITE_EXPORT_DIR.mkdir(parents=True, exist_ok=True) current_language_data = language_metadata data_types = data_type_metadata @@ -376,7 +385,6 @@ def data_to_sqlite( current_languages, identifier_case=identifier_case, input_file=input_file, - output_file=output_file, overwrite=overwrite, ) # Remove "translations" from each language's list so we don't create extra language-specific DBs @@ -392,7 +400,6 @@ def data_to_sqlite( language=lang, identifier_case=identifier_case, input_file=input_file, - output_file=output_file, overwrite=overwrite, ) @@ -421,7 +428,7 @@ def data_to_sqlite( if language_data_type_dict[lang] != []: maybe_over = "" db_file = ( - Path(output_file) + Path(DEFAULT_SQLITE_EXPORT_DIR) / f"{get_language_iso(lang).upper()}LanguageData.sqlite" ) if db_file.exists(): diff --git a/src/scribe_data/cli/convert/wrapper.py b/src/scribe_data/cli/convert/wrapper.py new file mode 100644 index 000000000..6293ede5c --- /dev/null +++ b/src/scribe_data/cli/convert/wrapper.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Wrapper function to convert data returned from the Scribe-Data CLI to other file types. +""" + +from pathlib import Path + +from scribe_data.cli.convert.to_csv_or_tsv import convert_to_csv_or_tsv +from scribe_data.cli.convert.to_json import convert_to_json +from scribe_data.cli.convert.to_sqlite import convert_to_sqlite +from scribe_data.utils import ( + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, +) + +# MARK: Convert Wrapper + + +def convert_wrapper( + languages: list[str] | None, + data_types: list | None, + input_path: Path, + output_type: str, + overwrite: bool = False, + identifier_case: str = "camel", + all: bool = False, +) -> None: + """ + Convert data to the specified output type: JSON, CSV/TSV, or SQLite. + + Parameters + ---------- + languages : Optional[List[str]] + The language(s) of the data to convert. + + data_types : Optional[List[str]] + The data type(s) of the data to convert. + + input_path : Path + The path to the input file or directory. + + output_type : str + The desired output format. Can be 'json', 'csv', 'tsv', or 'sqlite'. + + overwrite : bool, optional, default=False + Whether to overwrite existing output files. + + identifier_case : str, optional, default='camel' + The case format for identifiers. + + all : bool, optional, default=False + Convert all languages and data types. + + Returns + ------- + None + This function does not return any value; it performs a conversion operation. + """ + if input_path is None and data_types: + is_wiktionary = any( + isinstance(dt, str) and dt.startswith("wiktionary") + for dt in (data_types if isinstance(data_types, list) else [data_types]) + ) + input_path = ( + DEFAULT_WIKTIONARY_JSON_EXPORT_DIR + if is_wiktionary + else DEFAULT_JSON_EXPORT_DIR + ) + + if output_type == "json" and languages and data_types: + convert_to_json( + language=languages[0], # only one language possible + data_types=data_types, + input_file=input_path, + output_type=output_type, + overwrite=overwrite, + identifier_case=identifier_case, + ) + + elif output_type in {"csv", "tsv"} and languages and data_types: + convert_to_csv_or_tsv( + language=languages[0], # only one language possible + data_types=data_types, + input_file=input_path, + output_type=output_type, + overwrite=overwrite, + identifier_case=identifier_case, + ) + + elif output_type == "sqlite": + convert_to_sqlite( + languages=languages, + specific_tables=data_types, + identifier_case=identifier_case, + input_file=input_path, + overwrite=overwrite, + ) + + else: + raise ValueError( + f"Unsupported output type '{output_type}'. Must be 'json', 'csv', 'tsv' or 'sqlite'." + ) diff --git a/src/scribe_data/cli/download/__init__.py b/src/scribe_data/cli/download/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download/wikidata_lexeme_dump.py similarity index 63% rename from src/scribe_data/cli/download.py rename to src/scribe_data/cli/download/wikidata_lexeme_dump.py index 06d42dbbc..34b042060 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download/wikidata_lexeme_dump.py @@ -19,7 +19,6 @@ DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download, - resolve_lang_iso, ) @@ -36,6 +35,7 @@ def parse_date(date_string: str) -> date | None: ------- datetime.date Parsed date object if the format is valid. + None If the date format is invalid. """ @@ -53,7 +53,7 @@ def parse_date(date_string: str) -> date | None: return None -def available_closest_lexeme_dumpfile( +def available_closest_lexeme_dump_file( target_entity: str, other_old_dumps: list, check_wd_dump_exists: Callable[[str], str | None], @@ -76,6 +76,7 @@ def available_closest_lexeme_dumpfile( ------- str The closest available dump file date (as a string). + None If no suitable dump is found. """ @@ -123,6 +124,7 @@ def download_wd_lexeme_dump( ------- str The URL of the requested or closest available dump. + None If no suitable dump is found or the request fails. """ @@ -179,7 +181,7 @@ def check_wd_dump_exists(target_entity: str) -> str | None: return if other_old_dumps: - if closest_date := available_closest_lexeme_dumpfile( + if closest_date := available_closest_lexeme_dump_file( target_entity, other_old_dumps, check_wd_dump_exists ): print( @@ -206,9 +208,7 @@ def check_wd_dump_exists(target_entity: str) -> str | None: def wd_lexeme_dump_download_wrapper( - dump_snapshot: str | None = None, - output_dir: Path | None = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - default: bool = False, + dump_snapshot: str | None = None, default: bool = False ) -> Path | bool | None: """ Download Wikidata lexeme dumps given user preferences. @@ -218,10 +218,6 @@ def wd_lexeme_dump_download_wrapper( dump_snapshot : str Optional date string in YYYYMMDD format for specific dumps. - output_dir : Path - Optional directory path for the downloaded file. - Defaults to 'scribe_data_wikidata_dumps_export' directory. - default : bool, optional If True, skips the user confirmation prompt. Defaults to False. @@ -234,13 +230,13 @@ def wd_lexeme_dump_download_wrapper( - Returns None if the user chooses not to proceed with the download or no valid dump URL is found. """ try: - output_dir = output_dir or DEFAULT_WIKIDATA_DUMP_EXPORT_DIR - - os.makedirs(output_dir, exist_ok=True) + os.makedirs(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, exist_ok=True) # Don't check for lexeme if date given. if not dump_snapshot: - if useable_file_dir := check_lexeme_dump_prompt_download(output_dir): + if useable_file_dir := check_lexeme_dump_prompt_download( + DEFAULT_WIKIDATA_DUMP_EXPORT_DIR + ): return useable_file_dir dump_url = download_wd_lexeme_dump(dump_snapshot or "latest-lexemes") @@ -250,11 +246,7 @@ def wd_lexeme_dump_download_wrapper( return None filename = dump_url.split("/")[-1] - output_path = ( - output_dir / filename - if output_dir - else DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR / filename - ) + output_path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR / filename # Use default parameter to bypass user confirmation. user_response = ( @@ -294,117 +286,3 @@ def wd_lexeme_dump_download_wrapper( except Exception as e: rprint(f"[bold red]An error occurred: {e}[/bold red]") - - -def download_wiktionary_dumps( - output_dir: Path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - language_isos: list[str] = ["en"], - dump_snapshot: str | None = "latest", -) -> Path | None: - """ - Download the latest Wiktionary pages-articles dump based on passed language isos. - - Parameters - ---------- - output_dir : Path, optional, default=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR - Directory to save the dump. Defaults to DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR. - - language_isos : List[str], optional, default=['en'] - A list of ISO-2 codes for desired Wiktionary dumps. - - dump_snapshot : str, optional, default='latest' - The Wiktionary dump snapshot to be downloaded. - - Returns - ------- - Path - Path to the downloaded file, or None if aborted/failed. - """ - if isinstance(language_isos, str): - language_isos = [language_isos] - - resolved_isos = [] - not_included_isos = [] - for lang in language_isos: - iso = resolve_lang_iso(lang) - if iso: - resolved_isos.append(iso) - - else: - not_included_isos.append(lang) - - if not_included_isos: - iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos" - is_or_are = "is" if len(not_included_isos) == 1 else "are" - rprint( - f"[bold red]The following {iso_or_isos} {is_or_are} not included: {', '.join(not_included_isos)}[/bold red]" - ) - return None - - language_isos = resolved_isos - wiktionaries = [f"{iso}wiktionary" for iso in language_isos] - wiktionary_urls = [f"https://dumps.wikimedia.org/{w}" for w in wiktionaries] - - Path(output_dir).mkdir(parents=True, exist_ok=True) - for i, w, u in zip(language_isos, wiktionaries, wiktionary_urls): - # Note: Remove the snapshot from the resulting filename so Scribe-Server always looks for one file. - filename = f"{w}-pages-articles.xml.bz2" - download_filename = f"{w}-{dump_snapshot}-pages-articles.xml.bz2" - download_url = f"{u}/{dump_snapshot}/{download_filename}" - - rprint(f"[bold blue]Checking dump validity at {download_url}...[/bold blue]") - try: - response = requests.head(download_url, timeout=30) - response.raise_for_status() - - except requests.exceptions.RequestException as e: - rprint(f"[bold red]Invalid dump date or dump not found: {e}[/bold red]") - return None - - output_path = output_dir / filename - - if output_path.exists(): - rprint(f"[bold yellow]Dump already exists: {output_path}[/bold yellow]") - user_input = questionary.select( - "Do you want to:", - choices=[ - "Skip download", - "Download and overwrite", - ], - ).ask() - if user_input == "Skip download": - rprint("[bold green]Skipping download.[/bold green]") - return output_path - - rprint(f"[bold blue]Downloading to {output_path}...[/bold blue]") - try: - response = requests.get(download_url, stream=True, timeout=30) - response.raise_for_status() - total_size = int(response.headers.get("content-length", 0)) - - with open(output_path, "wb") as f: - with tqdm( - total=total_size, - unit="iB", - unit_scale=True, - desc=download_filename, - ) as pbar: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - pbar.update(len(chunk)) - - rprint( - f"[bold green]{i.upper()}Wiktionary dump download completed successfully![/bold green]" - ) - return output_path - - except requests.exceptions.RequestException as e: - rprint(f"[bold red]Download failed: {e}[/bold red]") - return None - - iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos" - was_or_were = "was" if len(not_included_isos) == 1 else "were" - rprint( - f"[bold green]The following {iso_or_isos} {was_or_were} successfully downloaded: {', '.join(not_included_isos)}[/bold green]" - ) diff --git a/src/scribe_data/cli/download/wiktionary_dump.py b/src/scribe_data/cli/download/wiktionary_dump.py new file mode 100644 index 000000000..2cb92bc41 --- /dev/null +++ b/src/scribe_data/cli/download/wiktionary_dump.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions for downloading Wiktionary dumps. +""" + +from pathlib import Path + +import questionary +import requests +from rich import print as rprint +from tqdm import tqdm + +from scribe_data.utils import ( + DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, + resolve_lang_iso, +) + + +def download_wiktionary_dumps( + language_isos: list[str] = ["en"], dump_snapshot: str | None = "latest" +) -> Path | None: + """ + Download the latest Wiktionary pages-articles dump based on passed language isos. + + Parameters + ---------- + language_isos : List[str], optional, default=['en'] + A list of ISO-2 codes for desired Wiktionary dumps. + + dump_snapshot : str, optional, default='latest' + The Wiktionary dump snapshot to be downloaded. + + Returns + ------- + Path + Path to the downloaded file, or None if aborted/failed. + """ + if isinstance(language_isos, str): + language_isos = [language_isos] + + resolved_isos = [] + not_included_isos = [] + for lang in language_isos: + if iso := resolve_lang_iso(lang): + resolved_isos.append(iso) + + else: + not_included_isos.append(lang) + + if not_included_isos: + iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos" + is_or_are = "is" if len(not_included_isos) == 1 else "are" + rprint( + f"[bold red]The following {iso_or_isos} {is_or_are} not included: {', '.join(not_included_isos)}[/bold red]" + ) + return None + + language_isos = resolved_isos + wiktionaries = [f"{iso}wiktionary" for iso in language_isos] + wiktionary_urls = [f"https://dumps.wikimedia.org/{w}" for w in wiktionaries] + + Path(DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR).mkdir(parents=True, exist_ok=True) + for i, w, u in zip(language_isos, wiktionaries, wiktionary_urls): + # Note: Remove the snapshot from the resulting filename so Scribe-Server always looks for one file. + filename = f"{w}-pages-articles.xml.bz2" + download_filename = f"{w}-{dump_snapshot}-pages-articles.xml.bz2" + download_url = f"{u}/{dump_snapshot}/{download_filename}" + + rprint(f"[bold blue]Checking dump validity at {download_url}...[/bold blue]") + try: + response = requests.head(download_url, timeout=30) + response.raise_for_status() + + except requests.exceptions.RequestException as e: + rprint(f"[bold red]Invalid dump date or dump not found: {e}[/bold red]") + return None + + output_path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR / filename + + if output_path.exists(): + rprint(f"[bold yellow]Dump already exists: {output_path}[/bold yellow]") + user_input = questionary.select( + "Do you want to:", + choices=[ + "Skip download", + "Download and overwrite", + ], + ).ask() + if user_input == "Skip download": + rprint("[bold green]Skipping download.[/bold green]") + return output_path + + rprint(f"[bold blue]Downloading to {output_path}...[/bold blue]") + try: + response = requests.get(download_url, stream=True, timeout=30) + response.raise_for_status() + total_size = int(response.headers.get("content-length", 0)) + + with open(output_path, "wb") as f: + with tqdm( + total=total_size, + unit="iB", + unit_scale=True, + desc=download_filename, + ) as pbar: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + + rprint( + f"[bold green]{i.upper()}Wiktionary dump download completed successfully![/bold green]" + ) + return output_path + + except requests.exceptions.RequestException as e: + rprint(f"[bold red]Download failed: {e}[/bold red]") + return None + + iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos" + was_or_were = "was" if len(not_included_isos) == 1 else "were" + rprint( + f"[bold green]The following {iso_or_isos} {was_or_were} successfully downloaded: {', '.join(not_included_isos)}[/bold green]" + ) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 71f7b0c18..b0d5a9f96 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -14,7 +14,7 @@ from rich import print as rprint from SPARQLWrapper.SPARQLExceptions import EndPointInternalError -from scribe_data.cli.convert import convert_wrapper +from scribe_data.cli.convert.wrapper import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, @@ -33,7 +33,6 @@ def get_data( languages: list[str] | None = None, data_types: list[str] | None = None, output_type: str = "json", - output_dir: Path | None = None, overwrite: bool = False, outputs_per_entry: int = 0, all_bool: bool = False, @@ -56,9 +55,6 @@ def get_data( output_type : str The output file type. - output_dir : Path - The output directory path for results. - overwrite : bool, default=False Whether to overwrite existing files. @@ -88,17 +84,16 @@ def get_data( """ # MARK: Defaults - if output_dir is None: - if data_types == ["translations"]: - output_dir = DEFAULT_WIKTIONARY_JSON_EXPORT_DIR + if data_types == ["translations"]: + output_dir = DEFAULT_WIKTIONARY_JSON_EXPORT_DIR - else: - output_dir = { - "csv": DEFAULT_CSV_EXPORT_DIR, - "json": DEFAULT_JSON_EXPORT_DIR, - "sqlite": DEFAULT_SQLITE_EXPORT_DIR, - "tsv": DEFAULT_TSV_EXPORT_DIR, - }.get(output_type, DEFAULT_JSON_EXPORT_DIR) + else: + output_dir = { + "csv": DEFAULT_CSV_EXPORT_DIR, + "json": DEFAULT_JSON_EXPORT_DIR, + "sqlite": DEFAULT_SQLITE_EXPORT_DIR, + "tsv": DEFAULT_TSV_EXPORT_DIR, + }.get(output_type, DEFAULT_JSON_EXPORT_DIR) language_or_languages = ( "language" if languages and len(languages) == 1 else "languages" @@ -305,7 +300,6 @@ def print_error_and_suggestions(error_message: str) -> None: languages=[language_or_sub_language], data_types=data_types, input_path=json_input_path, - output_dir=output_dir, output_type=output_type, overwrite=overwrite, identifier_case=identifier_case, diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py deleted file mode 100644 index cb208e6c2..000000000 --- a/src/scribe_data/cli/interactive.py +++ /dev/null @@ -1,663 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Interactive mode functionality for the Scribe-Data CLI to allow users to select request arguments. -""" - -import logging -from pathlib import Path - -import questionary -from prompt_toolkit import prompt -from prompt_toolkit.completion import WordCompleter -from rich import print as rprint -from rich.console import Console -from rich.logging import RichHandler -from rich.table import Table -from tqdm import tqdm - -from scribe_data.cli.convert import convert_wrapper - -# from scribe_data.cli.list import list_wrapper -from scribe_data.cli.get import get_data -from scribe_data.cli.total import total_wrapper -from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_SQLITE_EXPORT_DIR, - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, - data_type_metadata, - language_metadata, - list_all_languages, - resolve_lang_iso, -) -from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump - -# MARK: Config Setup - -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler(markup=True)], # Enable markup for colors -) -console = Console() -logger = logging.getLogger("rich") -THANK_YOU_MESSAGE = "[bold cyan]Thank you for using Scribe-Data![/bold cyan]" - - -class ScribeDataConfig: - """ - Class for the configuration of the interactive mode. - """ - - def __init__(self) -> None: - """ - Configure the interactive mode. - """ - self.languages = list_all_languages(language_metadata) - self.data_types = list(data_type_metadata.keys()) - self.selected_languages: list[str] = [] - self.selected_data_types: list[str] = [] - self.output_type: str = "json" - self.output_dir: Path = DEFAULT_JSON_EXPORT_DIR - self.overwrite: bool = False - self.configured: bool = False - self.identifier_case: str = "camel" - self.input_dir: Path = DEFAULT_JSON_EXPORT_DIR - self.output_dir_sqlite: Path = DEFAULT_SQLITE_EXPORT_DIR - - -config = ScribeDataConfig() - - -# MARK: Summary - - -def display_summary() -> None: - """ - Display a summary of the interactive mode request to run. - """ - table = Table( - title="Scribe-Data Request Configuration Summary", style="bright_white" - ) - - table.add_column("Setting", style="bold cyan", no_wrap=True) - table.add_column("Value(s)", style="magenta") - - table.add_row("Languages", ", ".join(config.selected_languages) or "None") - table.add_row("Data Types", ", ".join(config.selected_data_types) or "None") - table.add_row("Output Type", config.output_type) - table.add_row("Output Directory", str(config.output_dir)) - table.add_row("Overwrite", "Yes" if config.overwrite else "No") - - console.print("\n") - console.print(table, justify="left") - console.print("\n") - - -# Helper function to create a WordCompleter. -def create_word_completer( - options: list[str], include_all: bool = False -) -> WordCompleter: - """ - Return a word completer object of the given options. - - Parameters - ---------- - options : List[str] - The options that could complete the current input. - - include_all : bool - Whether 'All' should be an option. - - Returns - ------- - WordCompleter - The word completer object from which completions can be shown to the user. - """ - if include_all: - options = ["All"] + options - - return WordCompleter(options, ignore_case=True) - - -# MARK: Language Selection - - -def prompt_for_languages() -> None: - """ - Request language and data type for lexeme totals. - - Returns - ------- - None - Languages are added to the configuration or are asked for. - """ - language_completer = create_word_completer(config.languages, include_all=True) - initial_language_selection = ", ".join(config.selected_languages) - selected_languages = prompt( - "Select languages (comma-separated or 'All'): ", - default=initial_language_selection, - completer=language_completer, - ) - if "All" in selected_languages: - config.selected_languages = config.languages - - elif selected_languages.strip(): # check if input is not just whitespace - config.selected_languages = [ - lang.strip() - for lang in selected_languages.split(",") - if lang.strip() in config.languages - ] - - if not config.selected_languages: - rprint("[yellow]No language selected. Please try again.[/yellow]") - return prompt_for_languages() - - -def _wiktionary_dump_search_dirs(location: Path) -> list[Path]: - """ - Build an ordered list of directories to search for Wiktionary dumps. - - Each candidate directory is resolved and included only if it exists. - Duplicate paths are omitted while preserving the following search order: - - 1. The provided ``location`` directory. - 2. The default export directory (:data:`~scribe_data.utils.DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR`). - 3. The default export directory under every ancestor of the current working directory. - 4. The current working directory itself. - - Searching ancestor directories allows dumps to be found when the interactive mode - is started from a nested folder (e.g., ``scribe_data_wiktionary_json_export/spanish``). - - Parameters - ---------- - location : Path - User-supplied dump path or search root from - :func:`resolve_wiktionary_dump_path`. - - Returns - ------- - list[Path] - A deduplicated list of existing directories to search. - """ - candidates = [ - location, - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - *(parent / DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR for parent in Path.cwd().parents), - Path.cwd(), - ] - resolved_paths = [path.expanduser().resolve() for path in candidates] - return list(dict.fromkeys(path for path in resolved_paths if path.is_dir())) - - -def resolve_wiktionary_dump_path(language: str, location: str | Path) -> Path | None: - """ - Resolve a Wiktionary dump file for the given source language. - - Locates the newest Wiktionary XML dump for the specified language. - If the ``location`` argument points directly to a file, that file is returned. - Otherwise, it searches through a prioritized list of directories for dumps - matching the ``{iso}wiktionary*pages-articles.xml*`` pattern. - - Parameters - ---------- - language : str - Source language name (e.g. ``german``). - - location : str or Path - Path to a specific dump file, or a base directory to begin searching from. - - Returns - ------- - Path or None - The path to the newest matching dump file, the explicit file if ``location`` - is a file, or ``None`` if no matching dump is found. - """ - path = Path(location).expanduser().resolve() - if path.is_file(): - return path - - if not (iso := resolve_lang_iso(language)): - return None - - dumps = [ - dump_path - for search_dir in _wiktionary_dump_search_dirs(path) - for dump_path in search_dir.glob(f"{iso}wiktionary*pages-articles.xml*") - ] - return ( - max(dumps, key=lambda dump_path: dump_path.stat().st_mtime).resolve() - if dumps - else None - ) - - -# MARK: Data Type Selection - - -def prompt_for_data_types() -> None: - """ - Prompt the user to select data types. - - Returns - ------- - None - Data types are added to the configuration or are asked for. - """ - data_type_completer = create_word_completer(config.data_types, include_all=True) - initial_data_type_selection = ", ".join(config.selected_data_types) - - while True: - selected_data_types = prompt( - "Select data types (comma-separated or 'All'): ", - default=initial_data_type_selection, - completer=data_type_completer, - ) - if "All" in selected_data_types.capitalize(): - config.selected_data_types = config.data_types - break - - elif selected_data_types.strip(): # check if input is not just whitespace - config.selected_data_types = [ - dt.strip() - for dt in selected_data_types.split(",") - if dt.strip() in config.data_types - ] - if config.selected_data_types: - break # exit loop if valid data types are selected - - rprint("[yellow]No data type selected. Please try again.[/yellow]") - - -def configure_settings() -> None: - """ - Configure the settings of the interactive mode request. - - Asks for: - - Languages - - Data types - - Output type - - Output directory - - Whether to overwrite - """ - rprint( - "[cyan]Follow the prompts below. Press tab for completions and enter to select.[/cyan]" - ) - prompt_for_languages() - prompt_for_data_types() - - # MARK: Outputs - - output_type_completer = create_word_completer(["json", "csv", "tsv"]) - config.output_type = prompt( - "Select output type (json/csv/tsv): ", - default="json", - completer=output_type_completer, - ) - while config.output_type not in ["json", "csv", "tsv"]: - rprint("[yellow]Invalid output type selected. Please try again.[/yellow]") - config.output_type = prompt( - "Select output type (json/csv/tsv): ", - default="json", - completer=output_type_completer, - ) - - # MARK: Output Directory - - if output_dir := prompt(f"Enter output directory (default: {config.output_dir}): "): - config.output_dir = Path(output_dir) - - # MARK: Overwrite Confirmation - - overwrite_completer = create_word_completer(["Y", "n"]) - overwrite = ( - prompt("Overwrite existing files? (Y/n): ", completer=overwrite_completer) - or "y" - ) - config.overwrite = overwrite.lower() == "y" - - config.configured = True - display_summary() - - -def run_request() -> None: - """ - Execute the interactive mode request based on current configuration. - - Returns - ------- - None - An interactive mode request is ran. - """ - if not config.selected_languages or not config.selected_data_types: - rprint("[bold red]Error: Please configure languages and data types.[/bold red]") - return - - # Calculate total operations - total_operations = len(config.selected_languages) * len(config.selected_data_types) - - # MARK: Export Data - - with tqdm( - total=total_operations, - desc="Exporting data", - unit="operation", - ) as pbar: - for language in config.selected_languages: - for data_type in config.selected_data_types: - pbar.set_description(f"Exporting {language} {data_type} data") - - try: - get_data( - languages=[language], - data_types=[data_type], - output_type=config.output_type, - output_dir=config.output_dir, - overwrite=config.overwrite, - interactive=True, - ) - # The data was successfully written to file, so we can log success - logger.info( - f"[green]✔ Successfully exported {language} {data_type} data.[/green]" - ) - except Exception as e: - logger.error( - f"[red]✖ Failed to export {language} {data_type} data: {str(e)}[/red]" - ) - - pbar.update(1) - - if config.overwrite: - rprint("[bold green]Data request completed successfully![/bold green]") - - -def request_total_lexeme_loop() -> None: - """ - Continuously prompts for lexeme requests until exit. - """ - while True: - choice = questionary.select( - "What would you like to do?", - choices=[ - questionary.Choice("Configure total lexemes request", "total"), - questionary.Choice("Run total lexemes request", "run"), - questionary.Choice( - "Run total lexemes request with lexeme dumps", "run_all" - ), - questionary.Choice("Exit", "exit"), - ], - ).ask() - - if choice == "run": - total_wrapper( - languages=config.selected_languages, - data_types=config.selected_data_types, - all_bool=False, - ) - config.selected_languages, config.selected_data_types = [], [] - rprint(THANK_YOU_MESSAGE) - break - - elif choice == "run_all": - if wikidata_dump_path := prompt( - f"Enter Wikidata lexeme dump path (default: {str(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR)}): " - ): - wikidata_dump_path = Path(wikidata_dump_path) - - else: - wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR - - parse_wd_lexeme_dump( - languages=config.selected_languages, - wikidata_dump_path=wikidata_dump_path, - wikidata_dump_type=["total"], - interactive_mode=True, - ) - break - - elif choice == "exit": - return - - else: - prompt_for_languages() - prompt_for_data_types() - - -# MARK: List - -# def see_list_languages(): -# """ -# See list of languages. -# """ - -# choice = select( -# "What would you like to list?", -# choices=[ -# Choice("All languages", "all_languages"), -# Choice("Languages for a specific data type", "languages_for_data_type"), -# Choice("Data types for a specific language", "data_types_for_language"), -# ], -# ).ask() - -# if choice == "all_languages": -# list_wrapper(all_bool=True) - -# elif choice == "languages_for_data_type": -# list_wrapper(data_type=True) - -# elif choice == "data_types_for_language": -# list_wrapper(language=True) - - -# MARK: Start - - -def start_interactive_mode(operation: str | None = None) -> None: - """ - Entry point for interactive mode. - - Parameters - ---------- - operation : str - The type of operation that interactive mode is being ran with. - """ - while True: - # Check if both selected_languages and selected_data_types are empty. - if config.selected_languages or config.selected_data_types: - choices = [ - questionary.Choice("Configure get data request", "configure"), - questionary.Choice("Exit", "exit"), - ] - - if config.configured: - choices.insert( - 1, questionary.Choice("Run get data request with WDQS", "run") - ) - choices.insert( - 2, - questionary.Choice( - "Run get lexemes request with lexeme dumps", "run_all" - ), - ) - - elif config.selected_languages and config.selected_data_types: - choices.insert( - 1, questionary.Choice("Request for convert JSON", "convert_json") - ) - - else: - choices.insert( - 1, questionary.Choice("Request for total lexeme", "total") - ) - - elif operation == "get": - choices = [ - questionary.Choice("Configure get data request", "configure"), - # Choice("See list of languages", "languages"), - questionary.Choice("Exit", "exit"), - ] - - elif operation == "total": - choices = [ - questionary.Choice("Configure total lexemes request", "total"), - # Choice("See list of languages", "languages"), - questionary.Choice("Exit", "exit"), - ] - - elif operation == "convert": - choices = [ - questionary.Choice("Configure convert request", "convert"), - questionary.Choice("Exit", "exit"), - ] - - elif operation == "translations": - choices = [ - questionary.Choice("Configure translations request", "translations"), - # Choice("See list of languages", "languages"), - questionary.Choice("Exit", "exit"), - ] - - choice = questionary.select("What would you like to do?", choices=choices).ask() - - if choice == "configure": - configure_settings() - - elif choice == "run_all": - if wikidata_dump_path := prompt( - f"Enter Wikidata lexeme dump path (default: {str(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR)}): " - ): - wikidata_dump_path = Path(wikidata_dump_path) - - else: - wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR - - parse_wd_lexeme_dump( - languages=config.selected_languages, - data_types=config.selected_data_types, - wikidata_dump_type=["form"], - output_dir=config.output_dir, - wikidata_dump_path=wikidata_dump_path, - overwrite_all=config.overwrite, - interactive_mode=True, - ) - rprint(THANK_YOU_MESSAGE) - break - - elif choice == "total": - prompt_for_languages() - prompt_for_data_types() - request_total_lexeme_loop() - break - - elif choice == "convert": - prompt_for_languages() - prompt_for_data_types() - - # Use the default explicitly so that if the user enters nothing, the default value is retained. - user_input_dir = prompt( - f"Enter input directory (default: {config.input_dir}): ", - default=str(config.input_dir), - ) - config.input_dir = Path(user_input_dir) - - user_output_dir = prompt( - f"Enter output directory (default: {config.output_dir_sqlite}): ", - default=str(config.output_dir_sqlite), - ) - config.output_dir_sqlite = Path(user_output_dir) - - identifier_case = prompt( - "Enter identifier case (default: camel): ", - default="camel", - ) - output_type = prompt( - "Enter output type (default: sqlite): ", - default="sqlite", - ) - overwrite_str = prompt( - "Overwrite existing files? (default: False): ", - default="False", - ) - overwrite_bool = overwrite_str.strip().lower() in ("true", "y", "yes") - - convert_wrapper( - languages=config.selected_languages, - data_types=config.selected_data_types, - input_path=config.input_dir, # Use the updated configuration value - output_dir=config.output_dir_sqlite, - output_type=output_type, - identifier_case=identifier_case, - overwrite=overwrite_bool, - ) - break - - elif choice == "translations": - from scribe_data.wiktionary.parse_translations import ( - parse_wiktionary_translations, - ) - - while True: - wiktionary_dump_language = prompt( - "Select Wiktionary dump source language: ", - default="english", - completer=create_word_completer(config.languages), - ).strip() - if wiktionary_dump_language in config.languages: - break - rprint( - f"[bold red]Error: {wiktionary_dump_language} is not a valid language.[/bold red]" - ) - - dump_location = prompt( - "Enter Wiktionary dump directory or file path " - f"(default: {DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR}): ", - default=str(DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR), - ) - wiktionary_dump_path = resolve_wiktionary_dump_path( - wiktionary_dump_language, - dump_location, - ) - if not wiktionary_dump_path: - rprint( - f"[bold red]No {wiktionary_dump_language} Wiktionary dump found at " - f"{dump_location}.[/bold red]" - ) - break - - prompt_for_languages() - - translations_output_dir = prompt( - "Enter output directory " - f"(default: {DEFAULT_WIKTIONARY_JSON_EXPORT_DIR}): ", - default=str(DEFAULT_WIKTIONARY_JSON_EXPORT_DIR), - ) - - overwrite_str = prompt( - "Overwrite existing files? (default: False): ", - default="False", - ) - overwrite_bool = overwrite_str.strip().lower() in ("true", "y", "yes") - - parse_wiktionary_translations( - target_languages=config.selected_languages, - wiktionary_dump_path=Path(wiktionary_dump_path), - output_dir=Path(translations_output_dir), - overwrite=overwrite_bool, - ) - - break - - elif choice == "run": - run_request() - rprint(THANK_YOU_MESSAGE) - break - - else: - rprint(THANK_YOU_MESSAGE) - break - - -if __name__ == "__main__": - start_interactive_mode() diff --git a/src/scribe_data/cli/interactive/__init__.py b/src/scribe_data/cli/interactive/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/cli/interactive/config.py b/src/scribe_data/cli/interactive/config.py new file mode 100644 index 000000000..13bcf58a3 --- /dev/null +++ b/src/scribe_data/cli/interactive/config.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Interactive mode configuration for the Scribe-Data CLI to allow users to select request arguments. +""" + +from pathlib import Path + +from scribe_data.utils import ( + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_SQLITE_EXPORT_DIR, + data_type_metadata, + language_metadata, + list_all_languages, +) + +THANK_YOU_MESSAGE = "[bold cyan]Thank you for using Scribe-Data![/bold cyan]" + +# MARK: Config + + +class ScribeDataConfig: + """ + Class for the configuration of the interactive mode. + """ + + def __init__(self) -> None: + """ + Configure the interactive mode. + """ + self.languages = list_all_languages(language_metadata) + self.data_types = list(data_type_metadata.keys()) + self.selected_languages: list[str] = [] + self.selected_data_types: list[str] = [] + self.output_type: str = "json" + self.output_dir: Path = DEFAULT_JSON_EXPORT_DIR + self.overwrite: bool = False + self.configured: bool = False + self.identifier_case: str = "camel" + self.input_dir: Path = DEFAULT_JSON_EXPORT_DIR + self.output_dir_sqlite: Path = DEFAULT_SQLITE_EXPORT_DIR + + +interactive_mode_config = ScribeDataConfig() diff --git a/src/scribe_data/cli/interactive/execute.py b/src/scribe_data/cli/interactive/execute.py new file mode 100644 index 000000000..3ba7c7586 --- /dev/null +++ b/src/scribe_data/cli/interactive/execute.py @@ -0,0 +1,181 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Interactive mode execution for the Scribe-Data CLI to allow users to select request arguments. +""" + +import logging +from pathlib import Path + +import questionary +from prompt_toolkit import prompt +from rich import print as rprint +from rich.console import Console +from rich.logging import RichHandler +from rich.table import Table +from tqdm import tqdm + +from scribe_data.cli.get import get_data +from scribe_data.cli.interactive.config import ( + THANK_YOU_MESSAGE, + interactive_mode_config, +) +from scribe_data.cli.interactive.prompt import ( + prompt_for_data_types, + prompt_for_languages, +) +from scribe_data.cli.total.wrapper import total_wrapper +from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump + +# MARK: Logging + +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler(markup=True)], # Enable markup for colors +) +console = Console() +logger = logging.getLogger("rich") + +# MARK: Execute Request + + +def execute_request() -> None: + """ + Execute the interactive mode request based on current configuration. + + Returns + ------- + None + An interactive mode request is ran. + """ + if ( + not interactive_mode_config.selected_languages + or not interactive_mode_config.selected_data_types + ): + rprint("[bold red]Error: Please configure languages and data types.[/bold red]") + return + + # Calculate total operations + total_operations = len(interactive_mode_config.selected_languages) * len( + interactive_mode_config.selected_data_types + ) + + with tqdm( + total=total_operations, + desc="Exporting data", + unit="operation", + ) as pbar: + for language in interactive_mode_config.selected_languages: + for data_type in interactive_mode_config.selected_data_types: + pbar.set_description(f"Exporting {language} {data_type} data") + + try: + get_data( + languages=[language], + data_types=[data_type], + output_type=interactive_mode_config.output_type, + overwrite=interactive_mode_config.overwrite, + interactive=True, + ) + # The data was successfully written to file, so we can log success. + logger.info( + f"[green]✔ Successfully exported {language} {data_type} data.[/green]" + ) + + except Exception as e: + logger.error( + f"[red]✖ Failed to export {language} {data_type} data: {str(e)}[/red]" + ) + + pbar.update(1) + + if interactive_mode_config.overwrite: + rprint("[bold green]Data request completed successfully![/bold green]") + + +def request_total_lexeme_loop() -> None: + """ + Continuously prompts for lexeme requests until exit. + """ + while True: + choice = questionary.select( + "What would you like to do?", + choices=[ + questionary.Choice("Configure total lexemes request", "total"), + questionary.Choice("Run total lexemes request", "run"), + questionary.Choice( + "Run total lexemes request with lexeme dumps", "run_all" + ), + questionary.Choice("Exit", "exit"), + ], + ).ask() + + if choice == "run": + total_wrapper( + languages=interactive_mode_config.selected_languages, + data_types=interactive_mode_config.selected_data_types, + all_bool=False, + ) + ( + interactive_mode_config.selected_languages, + interactive_mode_config.selected_data_types, + ) = [], [] + rprint(THANK_YOU_MESSAGE) + break + + elif choice == "run_all": + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {str(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR)}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + else: + wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR + + parse_wd_lexeme_dump( + languages=interactive_mode_config.selected_languages, + wikidata_dump_path=wikidata_dump_path, + wikidata_dump_type=["total"], + interactive_mode=True, + ) + break + + elif choice == "exit": + return + + else: + prompt_for_languages() + prompt_for_data_types() + + +# MARK: Summary + + +def display_summary() -> None: + """ + Display a summary of the interactive mode request to run. + """ + table = Table( + title="Scribe-Data Request Configuration Summary", style="bright_white" + ) + + table.add_column("Setting", style="bold cyan", no_wrap=True) + table.add_column("Value(s)", style="magenta") + + table.add_row( + "Languages", + ", ".join(interactive_mode_config.selected_languages) or "None", + ) + table.add_row( + "Data Types", + ", ".join(interactive_mode_config.selected_data_types) or "None", + ) + table.add_row("Output Type", interactive_mode_config.output_type) + table.add_row("Output Directory", str(interactive_mode_config.output_dir)) + table.add_row("Overwrite", "Yes" if interactive_mode_config.overwrite else "No") + + console.print("\n") + console.print(table, justify="left") + console.print("\n") diff --git a/src/scribe_data/cli/interactive/prompt.py b/src/scribe_data/cli/interactive/prompt.py new file mode 100644 index 000000000..fdda147a9 --- /dev/null +++ b/src/scribe_data/cli/interactive/prompt.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Interactive mode prompting for the Scribe-Data CLI to allow users to select request arguments. +""" + +from pathlib import Path + +from prompt_toolkit import prompt +from prompt_toolkit.completion import WordCompleter +from rich import print as rprint + +from scribe_data.cli.interactive.config import interactive_mode_config +from scribe_data.utils import DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, resolve_lang_iso + +# MARK: Word Completion + + +def create_word_completer( + options: list[str], include_all: bool = False +) -> WordCompleter: + """ + Return a word completer object of the given options. + + Parameters + ---------- + options : List[str] + The options that could complete the current input. + + include_all : bool + Whether 'All' should be an option. + + Returns + ------- + WordCompleter + The word completer object from which completions can be shown to the user. + """ + if include_all: + options = ["All"] + options + + return WordCompleter(options, ignore_case=True) + + +# MARK: Languages + + +def prompt_for_languages() -> None: + """ + Request language and data type for lexeme totals. + + Returns + ------- + None + Languages are added to the configuration or are asked for. + """ + language_completer = create_word_completer( + interactive_mode_config.languages, include_all=True + ) + initial_language_selection = ", ".join(interactive_mode_config.selected_languages) + selected_languages = prompt( + "Select languages (comma-separated or 'All'): ", + default=initial_language_selection, + completer=language_completer, + ) + if "All" in selected_languages: + interactive_mode_config.selected_languages = interactive_mode_config.languages + + elif selected_languages.strip(): # check if input is not just whitespace + interactive_mode_config.selected_languages = [ + lang.strip() + for lang in selected_languages.split(",") + if lang.strip() in interactive_mode_config.languages + ] + + if not interactive_mode_config.selected_languages: + rprint("[yellow]No language selected. Please try again.[/yellow]") + return prompt_for_languages() + + +# MARK: Data Types + + +def prompt_for_data_types() -> None: + """ + Prompt the user to select data types. + + Returns + ------- + None + Data types are added to the configuration or are asked for. + """ + data_type_completer = create_word_completer( + interactive_mode_config.data_types, include_all=True + ) + initial_data_type_selection = ", ".join(interactive_mode_config.selected_data_types) + + while True: + selected_data_types = prompt( + "Select data types (comma-separated or 'All'): ", + default=initial_data_type_selection, + completer=data_type_completer, + ) + if "All" in selected_data_types.capitalize(): + interactive_mode_config.selected_data_types = ( + interactive_mode_config.data_types + ) + break + + elif selected_data_types.strip(): # check if input is not just whitespace + interactive_mode_config.selected_data_types = [ + dt.strip() + for dt in selected_data_types.split(",") + if dt.strip() in interactive_mode_config.data_types + ] + if interactive_mode_config.selected_data_types: + break # exit loop if valid data types are selected + + rprint("[yellow]No data type selected. Please try again.[/yellow]") + + +# MARK: Resolve Inputs + + +def _wiktionary_dump_search_dirs() -> list[Path]: + """ + Build an ordered list of directories to search for Wiktionary dumps. + + Each candidate directory is resolved and included only if it exists. + Duplicate paths are omitted while preserving the following search order: + + 1. The provided ``location`` directory. + 2. The default export directory (:data:`~scribe_data.utils.DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR`). + 3. The default export directory under every ancestor of the current working directory. + 4. The current working directory itself. + + Searching ancestor directories allows dumps to be found when the interactive mode + is started from a nested folder (e.g., ``scribe_data_wiktionary_json_export/spanish``). + + Returns + ------- + list[Path] + A deduplicated list of existing directories to search. + """ + candidates = [ + DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, + *(parent / DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR for parent in Path.cwd().parents), + Path.cwd(), + ] + resolved_paths = [path.expanduser().resolve() for path in candidates] + return list(dict.fromkeys(path for path in resolved_paths if path.is_dir())) + + +def resolve_wiktionary_dump_path(language: str, location: str | Path) -> Path | None: + """ + Resolve a Wiktionary dump file for the given source language. + + Locates the newest Wiktionary XML dump for the specified language. + If the ``location`` argument points directly to a file, that file is returned. + Otherwise, it searches through a prioritized list of directories for dumps + matching the ``{iso}wiktionary*pages-articles.xml*`` pattern. + + Parameters + ---------- + language : str + Source language name (e.g. ``german``). + + location : str or Path + Path to a specific dump file, or a base directory to begin searching from. + + Returns + ------- + Path or None + The path to the newest matching dump file, the explicit file if ``location`` + is a file, or ``None`` if no matching dump is found. + """ + path = Path(location).expanduser().resolve() + if path.is_file(): + return path + + if not (iso := resolve_lang_iso(language)): + return None + + dumps = [ + dump_path + for search_dir in _wiktionary_dump_search_dirs() + for dump_path in search_dir.glob(f"{iso}wiktionary*pages-articles.xml*") + ] + return ( + max(dumps, key=lambda dump_path: dump_path.stat().st_mtime).resolve() + if dumps + else None + ) diff --git a/src/scribe_data/cli/interactive/run.py b/src/scribe_data/cli/interactive/run.py new file mode 100644 index 000000000..9d37fb0db --- /dev/null +++ b/src/scribe_data/cli/interactive/run.py @@ -0,0 +1,299 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Interactive mode runner for the Scribe-Data CLI to allow users to select request arguments. +""" + +from pathlib import Path + +import questionary +from prompt_toolkit import prompt +from rich import print as rprint + +from scribe_data.cli.convert.wrapper import convert_wrapper +from scribe_data.cli.interactive.config import ( + THANK_YOU_MESSAGE, + interactive_mode_config, +) +from scribe_data.cli.interactive.execute import ( + display_summary, + execute_request, + request_total_lexeme_loop, +) +from scribe_data.cli.interactive.prompt import ( + create_word_completer, + prompt_for_data_types, + prompt_for_languages, + resolve_wiktionary_dump_path, +) +from scribe_data.utils import ( + DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, + DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, +) +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump + +# MARK: Configure + + +def configure_settings() -> None: + """ + Configure the settings of the interactive mode request. + + Asks for: + - Languages + - Data types + - Output type + - Output directory + - Whether to overwrite + """ + rprint( + "[cyan]Follow the prompts below. Press tab for completions and enter to select.[/cyan]" + ) + prompt_for_languages() + prompt_for_data_types() + + output_type_completer = create_word_completer(["json", "csv", "tsv"]) + interactive_mode_config.output_type = prompt( + "Select output type (json/csv/tsv): ", + default="json", + completer=output_type_completer, + ) + while interactive_mode_config.output_type not in ["json", "csv", "tsv"]: + rprint("[yellow]Invalid output type selected. Please try again.[/yellow]") + interactive_mode_config.output_type = prompt( + "Select output type (json/csv/tsv): ", + default="json", + completer=output_type_completer, + ) + + if output_dir := prompt( + f"Enter output directory (default: {interactive_mode_config.output_dir}): " + ): + interactive_mode_config.output_dir = Path(output_dir) + + overwrite_completer = create_word_completer(["Y", "n"]) + overwrite = ( + prompt("Overwrite existing files? (Y/n): ", completer=overwrite_completer) + or "y" + ) + interactive_mode_config.overwrite = overwrite.lower() == "y" + + interactive_mode_config.configured = True + display_summary() + + +# MARK: Start + + +def run_interactive_mode(operation: str | None = None) -> None: + """ + Entry point for interactive mode. + + Parameters + ---------- + operation : str + The type of operation that interactive mode is being ran with. + """ + while True: + # Check if both selected_languages and selected_data_types are empty. + if ( + interactive_mode_config.selected_languages + or interactive_mode_config.selected_data_types + ): + choices = [ + questionary.Choice("Configure get data request", "configure"), + questionary.Choice("Exit", "exit"), + ] + + if interactive_mode_config.configured: + choices.insert( + 1, questionary.Choice("Run get data request with WDQS", "run") + ) + choices.insert( + 2, + questionary.Choice( + "Run get lexemes request with lexeme dumps", "run_all" + ), + ) + + elif ( + interactive_mode_config.selected_languages + and interactive_mode_config.selected_data_types + ): + choices.insert( + 1, questionary.Choice("Request for convert JSON", "convert_json") + ) + + else: + choices.insert( + 1, questionary.Choice("Request for total lexeme", "total") + ) + + elif operation == "get": + choices = [ + questionary.Choice("Configure get data request", "configure"), + # Choice("See list of languages", "languages"), + questionary.Choice("Exit", "exit"), + ] + + elif operation == "total": + choices = [ + questionary.Choice("Configure total lexemes request", "total"), + # Choice("See list of languages", "languages"), + questionary.Choice("Exit", "exit"), + ] + + elif operation == "convert": + choices = [ + questionary.Choice("Configure convert request", "convert"), + questionary.Choice("Exit", "exit"), + ] + + elif operation == "translations": + choices = [ + questionary.Choice("Configure translations request", "translations"), + # Choice("See list of languages", "languages"), + questionary.Choice("Exit", "exit"), + ] + + choice = questionary.select("What would you like to do?", choices=choices).ask() + + if choice == "configure": + configure_settings() + + elif choice == "run_all": + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {str(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR)}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + else: + wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR + + parse_wd_lexeme_dump( + languages=interactive_mode_config.selected_languages, + data_types=interactive_mode_config.selected_data_types, + wikidata_dump_type=["form"], + output_dir=interactive_mode_config.output_dir, + wikidata_dump_path=wikidata_dump_path, + overwrite_all=interactive_mode_config.overwrite, + interactive_mode=True, + ) + rprint(THANK_YOU_MESSAGE) + break + + elif choice == "total": + prompt_for_languages() + prompt_for_data_types() + request_total_lexeme_loop() + break + + elif choice == "convert": + prompt_for_languages() + prompt_for_data_types() + + # Use the default explicitly so that if the user enters nothing, the default value is retained. + user_input_dir = prompt( + f"Enter input directory (default: {interactive_mode_config.input_dir}): ", + default=str(interactive_mode_config.input_dir), + ) + interactive_mode_config.input_dir = Path(user_input_dir) + + user_output_dir = prompt( + f"Enter output directory (default: {interactive_mode_config.output_dir_sqlite}): ", + default=str(interactive_mode_config.output_dir_sqlite), + ) + interactive_mode_config.output_dir_sqlite = Path(user_output_dir) + + identifier_case = prompt( + "Enter identifier case (default: camel): ", + default="camel", + ) + output_type = prompt( + "Enter output type (default: sqlite): ", + default="sqlite", + ) + overwrite_str = prompt( + "Overwrite existing files? (default: False): ", + default="False", + ) + overwrite_bool = overwrite_str.strip().lower() in ("true", "y", "yes") + + convert_wrapper( + languages=interactive_mode_config.selected_languages, + data_types=interactive_mode_config.selected_data_types, + input_path=interactive_mode_config.input_dir, # Use the updated configuration value + output_type=output_type, + identifier_case=identifier_case, + overwrite=overwrite_bool, + ) + break + + elif choice == "translations": + from scribe_data.wiktionary.parse_translations import ( + parse_wiktionary_translations, + ) + + while True: + wiktionary_dump_language = prompt( + "Select Wiktionary dump source language: ", + default="english", + completer=create_word_completer(interactive_mode_config.languages), + ).strip() + if wiktionary_dump_language in interactive_mode_config.languages: + break + rprint( + f"[bold red]Error: {wiktionary_dump_language} is not a valid language.[/bold red]" + ) + + dump_location = prompt( + "Enter Wiktionary dump directory or file path " + f"(default: {DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR}): ", + default=str(DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR), + ) + wiktionary_dump_path = resolve_wiktionary_dump_path( + wiktionary_dump_language, + dump_location, + ) + if not wiktionary_dump_path: + rprint( + f"[bold red]No {wiktionary_dump_language} Wiktionary dump found at " + f"{dump_location}.[/bold red]" + ) + break + + prompt_for_languages() + + translations_output_dir = prompt( + "Enter output directory " + f"(default: {DEFAULT_WIKTIONARY_JSON_EXPORT_DIR}): ", + default=str(DEFAULT_WIKTIONARY_JSON_EXPORT_DIR), + ) + + overwrite_str = prompt( + "Overwrite existing files? (default: False): ", + default="False", + ) + overwrite_bool = overwrite_str.strip().lower() in ("true", "y", "yes") + + parse_wiktionary_translations( + target_languages=interactive_mode_config.selected_languages, + wiktionary_dump_path=Path(wiktionary_dump_path), + output_dir=Path(translations_output_dir), + overwrite=overwrite_bool, + ) + + break + + elif choice == "run": + execute_request() + rprint(THANK_YOU_MESSAGE) + break + + else: + rprint(THANK_YOU_MESSAGE) + break + + +if __name__ == "__main__": + run_interactive_mode() diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py deleted file mode 100644 index 5a17b18ae..000000000 --- a/src/scribe_data/cli/list.py +++ /dev/null @@ -1,205 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Functions for listing languages and data types for the Scribe-Data CLI. -""" - -import os -from pathlib import Path - -from scribe_data.utils import ( - WIKIDATA_QUERIES_ALL_DATA_DIR, - format_sublanguage_name, - get_language_iso, - get_language_qid, - language_map, - language_metadata, - list_all_languages, -) - - -def list_languages() -> None: - """ - Generate a table of languages with their ISO-2 codes and Wikidata QIDs. - - Returns - ------- - None - A table of all languages with their ISO-2 codes and Wikidata QIDs is printed. - """ - languages = list_all_languages(language_metadata) - - language_col_width = max(len(lang) for lang in languages) + 2 - iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 - qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 - - table_line_length = language_col_width + iso_col_width + qid_col_width - - print( - f"{'\nLanguage':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" - ) - print("=" * table_line_length) - - for lang in languages: - print( - f"{lang.title():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}" - ) - - print() - - -def list_data_types(language: str = "") -> None: - """ - List all data types or those available for a given language. - - Parameters - ---------- - language : str - The language to potentially list data types for. - """ - languages = list_all_languages(language_metadata) - if language: - language = format_sublanguage_name(language, language_metadata) - language_data = language_map.get(language.lower()) - language_dir = WIKIDATA_QUERIES_ALL_DATA_DIR / language.lower() - - if not language_data: - raise ValueError(f"Language '{language.capitalize()}' is not recognized.") - - data_types = {f.name for f in language_dir.iterdir() if f.is_dir()} - - # Add emoji keywords if available. - iso = get_language_iso(language=language) - path_to_cldr_annotations = ( - Path(__file__).parent.parent - / "unicode" - / "cldr-annotations-full" - / "annotations" - ) - if iso in os.listdir(path_to_cldr_annotations): - data_types.add("emoji-keywords") - - if not data_types: - raise ValueError( - f"No data types available for language '{language.capitalize()}'." - ) - - table_header = f"Available data types: {language.capitalize()}" - - else: - data_types = set() - for lang in languages: - language_dir = WIKIDATA_QUERIES_ALL_DATA_DIR / format_sublanguage_name( - lang, language_metadata - ) - if language_dir.is_dir(): - data_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) - - data_types.add("emoji-keywords") - - table_header = "Available data types: All languages" - - table_line_length = max(len(table_header), max(len(dt) for dt in data_types)) - - print() - print(table_header) - print("=" * table_line_length) - - data_types = sorted(data_types) - for dt in data_types: - print(dt.replace("_", "-")) - - print() - - -def list_all() -> None: - """ - List all available languages and data types. - - Returns - ------- - None - All available languages and data types are listed. - """ - list_languages() - list_data_types() - - -def list_languages_for_data_type(data_type: str) -> None: - """ - List the available languages for a given data type. - - Parameters - ---------- - data_type : str - The data type to check for. - - Returns - ------- - None - A list of languages for data types is printed to the terminal. - """ - list_languages() - # corrected_data_type = correct_data_type(data_type=data_type) - # all_languages = list_languages_with_metadata_for_data_type(language_metadata) - - # # Set column widths for consistent formatting. - # language_col_width = max(len(lang["name"]) for lang in all_languages) + 2 - # iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2 - # qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2 - - # table_line_length = language_col_width + iso_col_width + qid_col_width - - # # Print table header. - # print( - # f"{'\nLanguage':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" - # ) - # print("=" * table_line_length) - - # # Iterate through the list of languages and format each row. - # for lang in all_languages: - # print( - # f"{lang['name'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" - # ) - - # print() - - -def list_wrapper( - language: str = "", data_type: str = "", all_bool: bool = False -) -> None: - """ - Conditionally provides the full functionality of the list command. - - Parameters - ---------- - language : str - The language to potentially list data types for. - - data_type : str - The data type to check for. - - all_bool : bool - Whether all languages and data types should be listed. - - Returns - ------- - None - The call to list functions based on the provided arguments. - """ - if (not language and not data_type) or all_bool: - list_all() - - elif language is True and not data_type: - list_languages() - - elif not language and data_type is True: - list_data_types() - - elif language is True and data_type is True: - print("Please specify either a language or a data type.") - - elif language is True and data_type is not None: - list_languages_for_data_type(data_type) - - elif language is not None and data_type is True: - list_data_types(language) diff --git a/src/scribe_data/cli/list/__init__.py b/src/scribe_data/cli/list/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/cli/list/data_types.py b/src/scribe_data/cli/list/data_types.py new file mode 100644 index 000000000..b1b9908c5 --- /dev/null +++ b/src/scribe_data/cli/list/data_types.py @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions for listing data types for the Scribe-Data CLI. +""" + +import os +from pathlib import Path + +from scribe_data.utils import ( + WIKIDATA_QUERIES_ALL_DATA_DIR, + format_sublanguage_name, + get_language_iso, + language_map, + language_metadata, + list_all_languages, +) + +# MARK: Data Types + + +def list_data_types(language: str = "") -> None: + """ + List all data types or those available for a given language. + + Parameters + ---------- + language : str + The language to potentially list data types for. + """ + languages = list_all_languages(language_metadata) + if language: + language = format_sublanguage_name(language, language_metadata) + language_data = language_map.get(language.lower()) + language_dir = WIKIDATA_QUERIES_ALL_DATA_DIR / language.lower() + + if not language_data: + raise ValueError(f"Language '{language.capitalize()}' is not recognized.") + + data_types = {f.name for f in language_dir.iterdir() if f.is_dir()} + + # Add emoji keywords if available. + iso = get_language_iso(language=language) + path_to_cldr_annotations = ( + Path(__file__).parent.parent + / "unicode" + / "cldr-annotations-full" + / "annotations" + ) + if iso in os.listdir(path_to_cldr_annotations): + data_types.add("emoji-keywords") + + if not data_types: + raise ValueError( + f"No data types available for language '{language.capitalize()}'." + ) + + table_header = f"Available data types: {language.capitalize()}" + + else: + data_types = set() + for lang in languages: + language_dir = WIKIDATA_QUERIES_ALL_DATA_DIR / format_sublanguage_name( + lang, language_metadata + ) + if language_dir.is_dir(): + data_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) + + data_types.add("emoji-keywords") + + table_header = "Available data types: All languages" + + table_line_length = max(len(table_header), max(len(dt) for dt in data_types)) + + print() + print(table_header) + print("=" * table_line_length) + + data_types = sorted(data_types) + for dt in data_types: + print(dt.replace("_", "-")) + + print() diff --git a/src/scribe_data/cli/list/languages.py b/src/scribe_data/cli/list/languages.py new file mode 100644 index 000000000..1dbc7d770 --- /dev/null +++ b/src/scribe_data/cli/list/languages.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions for listing languages for the Scribe-Data CLI. +""" + +from scribe_data.utils import ( + get_language_iso, + get_language_qid, + language_metadata, + list_all_languages, +) + +# MARK: Languages + + +def list_languages() -> None: + """ + Generate a table of languages with their ISO-2 codes and Wikidata QIDs. + + Returns + ------- + None + A table of all languages with their ISO-2 codes and Wikidata QIDs is printed. + """ + languages = list_all_languages(language_metadata) + + language_col_width = max(len(lang) for lang in languages) + 2 + iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 + qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 + + table_line_length = language_col_width + iso_col_width + qid_col_width + + print( + f"{'\nLanguage':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" + ) + print("=" * table_line_length) + + for lang in languages: + print( + f"{lang.title():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}" + ) + + print() + + +def list_languages_for_data_type(data_type: str) -> None: + """ + List the available languages for a given data type. + + Parameters + ---------- + data_type : str + The data type to check for. + + Returns + ------- + None + A list of languages for data types is printed to the terminal. + """ + list_languages() diff --git a/src/scribe_data/cli/list/wrapper.py b/src/scribe_data/cli/list/wrapper.py new file mode 100644 index 000000000..0b701da72 --- /dev/null +++ b/src/scribe_data/cli/list/wrapper.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions for listing languages and data types for the Scribe-Data CLI. +""" + +from scribe_data.cli.list.data_types import list_data_types +from scribe_data.cli.list.languages import list_languages, list_languages_for_data_type + +# MARK: All + + +def list_all() -> None: + """ + List all available languages and data types. + + Returns + ------- + None + All available languages and data types are listed. + """ + list_languages() + list_data_types() + + +# MARK: Wrapper + + +def list_wrapper( + language: str = "", data_type: str = "", all_bool: bool = False +) -> None: + """ + Conditionally provides the full functionality of the list command. + + Parameters + ---------- + language : str + The language to potentially list data types for. + + data_type : str + The data type to check for. + + all_bool : bool + Whether all languages and data types should be listed. + + Returns + ------- + None + The call to list functions based on the provided arguments. + """ + if (not language and not data_type) or all_bool: + list_all() + + elif language is True and not data_type: + list_languages() + + elif not language and data_type is True: + list_data_types() + + elif language is True and data_type is True: + print("Please specify either a language or a data type.") + + # Note: Saved for if listing languages by data type is implemented. + elif language is True and data_type is not None: + list_languages_for_data_type(data_type) + + elif language is not None and data_type is True: + list_data_types(language) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 24ae81240..462fe9a72 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -5,41 +5,28 @@ #!/usr/bin/env python3 import argparse -from pathlib import Path from questionary import select, text from rich import print as rprint from scribe_data.cli.cli_utils import validate_language_and_data_type -from scribe_data.cli.contracts.check import check_contracts +from scribe_data.cli.contracts.check import check_contract_data_print_missing from scribe_data.cli.contracts.export import export_contracts from scribe_data.cli.contracts.filter import export_data_filtered_by_contracts -from scribe_data.cli.convert import convert_wrapper -from scribe_data.cli.download import ( - download_wiktionary_dumps, +from scribe_data.cli.convert.wrapper import convert_wrapper +from scribe_data.cli.download.wikidata_lexeme_dump import ( wd_lexeme_dump_download_wrapper, ) +from scribe_data.cli.download.wiktionary_dump import download_wiktionary_dumps from scribe_data.cli.get import get_data -from scribe_data.cli.interactive import start_interactive_mode -from scribe_data.cli.list import list_wrapper -from scribe_data.cli.total import total_wrapper +from scribe_data.cli.interactive.run import run_interactive_mode +from scribe_data.cli.list.wrapper import list_wrapper +from scribe_data.cli.total.wrapper import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message -from scribe_data.utils import ( - DEFAULT_CONTRACTS_EXPORT_DIR, - DEFAULT_CSV_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, -) -LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." -GET_DESCRIPTION = ( - "Get data from Wikidata and other sources for the given languages and data types." -) -TOTAL_DESCRIPTION = "Check Wikidata for the total available data for the given languages and data types." -CONVERT_DESCRIPTION = "Convert data returned by Scribe-Data to different file types." CLI_EPILOG = "Visit the codebase at https://github.com/scribe-org/Scribe-Data and documentation at https://scribe-data.readthedocs.io to learn more!" +CLI_HELP_MSG = "Show this help message and exit." def main() -> None: @@ -54,13 +41,13 @@ def main() -> None: # MARK: CLI Base parser = argparse.ArgumentParser( - description="The Scribe-Data CLI is a tool for extracting language data from Wikidata and other sources.", + description="The Scribe-Data CLI is a tool for extracting language data from Wikidata, Wiktionary and other sources.", epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=30), ) subparsers = parser.add_subparsers(dest="command") - parser._actions[0].help = "Show this help message and exit." + parser._actions[0].help = CLI_HELP_MSG parser.add_argument( "-v", @@ -76,7 +63,9 @@ def main() -> None: help="Upgrade the Scribe-Data CLI to the latest version.", ) - # MARK: List + # MARK: List Args + + LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." list_parser = subparsers.add_parser( "list", @@ -87,20 +76,20 @@ def main() -> None: formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - list_parser._actions[0].help = "Show this help message and exit." + list_parser._actions[0].help = CLI_HELP_MSG list_parser.add_argument( "-lang", "--language", - nargs="?", const=True, + nargs="?", help="List options for all or given languages.", ) list_parser.add_argument( "-dt", "--data-type", - nargs="?", const=True, + nargs="?", help="List options for all or given data types (e.g., nouns, verbs).", ) list_parser.add_argument( @@ -110,7 +99,9 @@ def main() -> None: help="List all languages and data types.", ) - # MARK: Get + # MARK: Get Args + + GET_DESCRIPTION = "Get data from Wikidata, Wiktionary and other sources for the given languages and data types." get_parser = subparsers.add_parser( "get", @@ -121,7 +112,7 @@ def main() -> None: formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - get_parser._actions[0].help = "Show this help message and exit." + get_parser._actions[0].help = CLI_HELP_MSG get_parser.add_argument( "-lang", @@ -144,12 +135,6 @@ def main() -> None: choices=["json", "csv", "tsv", "sqlite"], help="The output file type.", ) - get_parser.add_argument( - "-od", - "--output-dir", - type=str, - help=f"The output directory path for results. Default: ./{DEFAULT_JSON_EXPORT_DIR} for JSON; ./{DEFAULT_CSV_EXPORT_DIR} for CSV, etc.", - ) get_parser.add_argument( "-ope", "--outputs-per-entry", @@ -162,18 +147,6 @@ def main() -> None: action="store_true", help="Whether to overwrite existing files (default: False).", ) - get_parser.add_argument( - "-a", - "--all", - action=argparse.BooleanOptionalAction, - help="Get all languages and data types.", - ) - get_parser.add_argument( - "-i", - "--interactive", - action="store_true", - help="Run Scribe-Data in interactive mode to choose your commands from an helpful terminal interface", - ) get_parser.add_argument( "-ic", "--identifier-case", @@ -183,21 +156,21 @@ def main() -> None: help="The case format for identifiers in the output data (default: camel).", ) get_parser.add_argument( - "-wdp", - "--wikidata-dump-path", - nargs="?", - const=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - help=f"The output directory path for the downloaded Wikidata dump. Uses default directory ./{DEFAULT_WIKIDATA_DUMP_EXPORT_DIR} if no path provided.", + "-a", + "--all", + action=argparse.BooleanOptionalAction, + help="Get all languages and data types.", ) get_parser.add_argument( - "-wtp", - "--wiktionary-dump-path", - nargs="?", - const=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - help=f"Path to download *wiktionary-*-pages-articles.xml.bz2 Wiktionary dumps for translations. Uses default directory ./{DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR} if no path provided.", + "-i", + "--interactive", + action="store_true", + help="Run Scribe-Data in interactive mode to choose your commands from an helpful terminal interface", ) - # MARK: Total + # MARK: Total Args + + TOTAL_DESCRIPTION = "Check Wikidata for the total available data for the given languages and data types." total_parser = subparsers.add_parser( "total", @@ -208,7 +181,7 @@ def main() -> None: formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - total_parser._actions[0].help = "Show this help message and exit." + total_parser._actions[0].help = CLI_HELP_MSG total_parser.add_argument( "-lang", "--language", type=str, help="The language(s) to check totals for." @@ -223,20 +196,17 @@ def main() -> None: "-a", "--all", action=argparse.BooleanOptionalAction, - help="Check for all languages and data types.", + help="Check totals for all languages and data types.", ) total_parser.add_argument( "-i", "--interactive", action="store_true", help="Run in interactive mode" ) - total_parser.add_argument( - "-wdp", - "--wikidata-dump-path", - nargs="?", - const=True, - help=f"Path to a local Wikidata lexemes dump for running with '--all'. Uses default directory ./{DEFAULT_WIKIDATA_DUMP_EXPORT_DIR} if no path provided.", - ) - # MARK: Convert + # MARK: Convert Args + + CONVERT_DESCRIPTION = ( + "Convert data returned by Scribe-Data to different file types." + ) convert_parser = subparsers.add_parser( "convert", @@ -247,7 +217,7 @@ def main() -> None: formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - convert_parser._actions[0].help = "Show this help message and exit." + convert_parser._actions[0].help = CLI_HELP_MSG convert_parser.add_argument( "-lang", @@ -255,7 +225,7 @@ def main() -> None: type=str, required=False, nargs="+", - help="The language of the file to convert.", + help="The language(s) of the file to convert.", ) convert_parser.add_argument( "-dt", @@ -265,14 +235,6 @@ def main() -> None: nargs="+", help="The data type(s) of the file to convert (e.g., nouns, verbs).", ) - convert_parser.add_argument( - "-if", - "--input-file", - type=Path, - required=False, - default=None, - help="The path to the input file to convert.", - ) convert_parser.add_argument( "-ot", "--output-type", @@ -281,13 +243,6 @@ def main() -> None: default="False", help="The output file type.", ) - convert_parser.add_argument( - "-od", - "--output-dir", - type=str, - default=None, - help="The directory where the output file will be saved.", - ) convert_parser.add_argument( "-o", "--overwrite", @@ -319,51 +274,40 @@ def main() -> None: "-i", "--interactive", action="store_true", help="Run in interactive mode" ) - # MARK: Download + # MARK: Download Args + + DOWNLOAD_DESCRIPTION = ( + "Download Wikidata lexeme or Wiktionary dumps from dumps.wikimedia.org." + ) download_parser = subparsers.add_parser( "download", aliases=["d"], - help="Download Wikidata lexeme or Wiktionary dumps.", - description="Download Wikidata lexeme or Wiktionary dumps from dumps.wikimedia.org.", + help=DOWNLOAD_DESCRIPTION, + description=DOWNLOAD_DESCRIPTION, epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - download_parser._actions[0].help = "Show this help message and exit." + download_parser._actions[0].help = CLI_HELP_MSG download_parser.add_argument( "-lang", "--language", type=str, - help="Target language or ISO code for Wiktionary dumps to download.", nargs="+", - ) - download_parser.add_argument( - "-wdp", - "--wikidata-dump-path", - type=str, - nargs="?", - const=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - help=f"The output directory path for the downloaded Wikidata dump. Uses default directory ./{DEFAULT_WIKIDATA_DUMP_EXPORT_DIR} if no path provided.", - ) - download_parser.add_argument( - "-wtp", - "--wiktionary-dump-path", - nargs="?", - const=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - help=f"Path to download *wiktionary-*-pages-articles.xml.bz2 Wiktionary dumps for translations. Uses default directory ./{DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR} if no path provided.", + help="Target language or ISO code for the Wiktionary dump(s) to download.", ) download_parser.add_argument( "-ds", "--dump-snapshot", type=str, - nargs="?", default="latest", + nargs="?", help="The desired snapshot of a Wikidata or Wiktionary dump (default 'latest'). Optionally specify date in YYYYMMDD format.", ) - # MARK: Interactive + # MARK: Interactive Args interactive_parser = subparsers.add_parser( "interactive", @@ -372,86 +316,49 @@ def main() -> None: description="Run in interactive mode.", ) - interactive_parser._actions[0].help = "Show this help message and exit." + interactive_parser._actions[0].help = CLI_HELP_MSG - # MARK: Export Contracts + # MARK: Contract Args + + EXPORT_CONTRACTS_DESCRIPTION = ( + "Export Scribe-Data contracts to the current working directory." + ) export_contracts_parser = subparsers.add_parser( "export_contracts", aliases=["ec"], - help="Export Scribe-Data contracts to a local directory.", - description="Export Scribe-Data contracts to the current working directory.", + help=EXPORT_CONTRACTS_DESCRIPTION, + description=EXPORT_CONTRACTS_DESCRIPTION, epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - export_contracts_parser._actions[0].help = "Show this help message and exit." - export_contracts_parser.add_argument( - "-od", - "--output-dir", - type=str, - required=False, - default=DEFAULT_CONTRACTS_EXPORT_DIR, - help="The directory to export contracts to (default: current scribe_data_contracts).", - ) + export_contracts_parser._actions[0].help = CLI_HELP_MSG - # MARK: Check Contracts + CHECK_CONTRACTS_DESCRIPTION = "Check the data in a Scribe-Data export directory against data contracts to see that all needed language data is included." check_contracts_parser = subparsers.add_parser( "check_contracts", aliases=["cc"], - help="Check the data in a Scribe-Data export directory to see that all needed language data is included.", - description="Check if data exports match their corresponding data contracts.", + help=CHECK_CONTRACTS_DESCRIPTION, + description=CHECK_CONTRACTS_DESCRIPTION, epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - check_contracts_parser._actions[0].help = "Show this help message and exit." - check_contracts_parser.add_argument( - "-cd", - "--contracts-dir", - type=str, - required=False, - help="The directory where the contracts are saved.", - ) - check_contracts_parser.add_argument( - "-od", - "--output-dir", - type=str, - required=False, - help="The directory with the data that the contracts should be checked against.", - ) + check_contracts_parser._actions[0].help = CLI_HELP_MSG - # MARK: Filter by Contracts + FILTER_BY_CONTRACTS_DESCRIPTION = ( + "Filter exported Scribe-Data data based on provided data contracts." + ) filter_data_parser = subparsers.add_parser( "filter_data", aliases=["fd"], - help="Filter exported Scribe-Data data based on provided data contract values.", - description="Convert exported data into a dataset that only includes data within contract values.", + help=FILTER_BY_CONTRACTS_DESCRIPTION, + description=FILTER_BY_CONTRACTS_DESCRIPTION, epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - filter_data_parser._actions[0].help = "Show this help message and exit." - filter_data_parser.add_argument( - "-cd", - "--contracts-dir", - type=str, - required=False, - help="The directory where the data contracts are saved.", - ) - filter_data_parser.add_argument( - "-id", - "--input-dir", - type=str, - required=False, - help="The directory with the data that should be filtered.", - ) - filter_data_parser.add_argument( - "-od", - "--output-dir", - type=str, - required=False, - help="The directory to export data filtered by contracts to.", - ) + filter_data_parser._actions[0].help = CLI_HELP_MSG # MARK: Setup CLI @@ -488,14 +395,18 @@ def main() -> None: print(f"Input validation failed with error: {e}") return + # MARK: List + if args.command in ["list", "l"]: list_wrapper( language=args.language, data_type=args.data_type, all_bool=args.all ) + # MARK: Get + elif args.command in ["get", "g"]: if args.interactive: - start_interactive_mode(operation="get") + run_interactive_mode(operation="get") return else: @@ -524,7 +435,6 @@ def main() -> None: languages=[language], data_types=[data_type], output_type=args.output_type, - output_dir=args.output_dir, outputs_per_entry=args.outputs_per_entry, overwrite=args.overwrite, all_bool=args.all, @@ -539,7 +449,6 @@ def main() -> None: languages=languages or None, data_types=data_types or None, output_type=args.output_type, - output_dir=args.output_dir, outputs_per_entry=args.outputs_per_entry, overwrite=args.overwrite, all_bool=args.all, @@ -548,9 +457,11 @@ def main() -> None: wiktionary_dump=args.wiktionary_dump_path, ) + # MARK: Total + elif args.command in ["total", "t"]: if args.interactive: - start_interactive_mode(operation="total") + run_interactive_mode(operation="total") else: total_wrapper( @@ -564,9 +475,11 @@ def main() -> None: wikidata_dump=args.wikidata_dump_path, ) + # MARK: Convert + elif args.command in ["convert", "c"]: if args.interactive: - start_interactive_mode(operation="convert") + run_interactive_mode(operation="convert") return # Handle language(s) - could be string or list. @@ -590,18 +503,18 @@ def main() -> None: languages=languages, data_types=data_types, input_path=args.input_file, - output_dir=args.output_dir, output_type=args.output_type, overwrite=args.overwrite, identifier_case=args.identifier_case, all=args.all, ) + # MARK: Download + elif args.command in ["download", "d"]: if getattr(args, "wiktionary_dump_path", False): download_wiktionary_dumps( dump_snapshot=args.dump_snapshot, - output_dir=args.wiktionary_dump_path, **( dict(language_isos=args.language) if args.language is not None @@ -612,7 +525,6 @@ def main() -> None: elif getattr(args, "wikidata_dump_path", False): wd_lexeme_dump_download_wrapper( dump_snapshot=args.dump_snapshot, - output_dir=args.wikidata_dump_path, ) else: @@ -620,6 +532,8 @@ def main() -> None: "[bold red]Please indicate if a Wikidata or Wiktionary dump should be downloaded by passing the -wdp or -wtp arguments respectively.[/bold red]" ) + # MARK: Interactive + elif args.command in ["interactive", "i"]: rprint( f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]" @@ -641,40 +555,37 @@ def main() -> None: wd_lexeme_dump_download_wrapper() elif action == "Download a Wiktionary dump": - lang = text( + if lang := text( "Which language dump do you want to download?", default="en", - ).ask() - if lang: + ).ask(): download_wiktionary_dumps(language_isos=[lang]) elif action == "Check for totals": - start_interactive_mode(operation="total") + run_interactive_mode(operation="total") elif action == "Get data": - start_interactive_mode(operation="get") + run_interactive_mode(operation="get") elif action == "Get translations": - start_interactive_mode(operation="translations") + run_interactive_mode(operation="translations") elif action == "Convert JSON": - start_interactive_mode(operation="convert") + run_interactive_mode(operation="convert") else: print("Skipping action") + # MARK: Contracts + elif args.command in ["export_contracts", "ec"]: - export_contracts(output_dir=args.output_dir) + export_contracts() elif args.command in ["check_contracts", "cc"]: - check_contracts(output_dir=args.output_dir) + check_contract_data_print_missing(contracts_dir=args.contracts_dir) elif args.command in ["filter_data", "fd"]: - export_data_filtered_by_contracts( - contracts_dir=args.contracts_dir, - input_dir=args.input_dir, - output_dir=args.output_dir, - ) + export_data_filtered_by_contracts(contracts_dir=args.contracts_dir) else: parser.print_help() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py deleted file mode 100644 index b78ad3e25..000000000 --- a/src/scribe_data/cli/total.py +++ /dev/null @@ -1,450 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Functions to check the total language data available on Wikidata. -""" - -from http.client import IncompleteRead -from pathlib import Path -from typing import Any, cast -from urllib.error import HTTPError - -from SPARQLWrapper import JSON - -from scribe_data.utils import ( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - WIKIDATA_QUERIES_ALL_DATA_DIR, - check_qid_is_language, - data_type_metadata, - format_sublanguage_name, - language_metadata, - language_to_qid, - list_all_languages, -) -from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql - - -def get_qid_by_input(input_str: str | None) -> str | None: - """ - Retrieve the QID for a given language or data type input string. - - Parameters - ---------- - input_str : str, optional - The input string representing a language or data type. - - Returns - ------- - str | None - The QID corresponding to the input string, or- None if not found. - """ - if input_str: - if input_str in language_to_qid: - return language_to_qid[input_str] - - elif input_str in data_type_metadata: - return data_type_metadata[input_str] - - return None - - -def get_datatype_list(language: str) -> list | dict: - """ - Get the data types for a given language based on the project directory structure. - - Parameters - ---------- - language : str - The language to return data types for. - - Returns - ------- - list | dict - A list of the corresponding data types. - """ - language_key = language.strip().lower() # normalize input - languages = list_all_languages(language_metadata) - - # Adjust language_key for sub-languages using the format_sublanguage_name function. - formatted_language = format_sublanguage_name(language_key, language_metadata) - language_key = formatted_language.split(" ")[ - 0 - ].lower() # use the main language part if formatted - - if language_key in languages: - if "sub_languages" in language_metadata[language_key]: - sub_languages = language_metadata[language_key]["sub_languages"] - data_types = [] - - for sub_lang_key in sub_languages: - sub_lang_dir = ( - WIKIDATA_QUERIES_ALL_DATA_DIR / sub_languages[sub_lang_key]["iso"] - ) - if sub_lang_dir.exists(): - data_types.extend( - [f.name for f in sub_lang_dir.iterdir() if f.is_dir()] - ) - - if not data_types: - raise ValueError( - f"No data types available for sub-languages of '{formatted_language.capitalize()}'." - ) - - return sorted(set(data_types)) # remove duplicates and sort - - else: - language_dir = WIKIDATA_QUERIES_ALL_DATA_DIR / language_key - if not language_dir.exists(): - raise ValueError(f"Directory '{language_dir}' does not exist.") - - data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] - - if not data_types: - raise ValueError( - f"No data types available for language '{formatted_language.capitalize()}'." - ) - - return sorted(data_types) - - else: # return all data types - return data_type_metadata - - -# MARK: Print - - -def print_total_lexemes(language: str | None = None) -> None: - """ - Print the total number of available entities for all data types. - - Parameters - ---------- - language : str, optional - The language to display data type entity counts for. - - Returns - ------- - str - A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. - """ - if language is None: - print("Returning total counts for all languages and data types...\n") - - elif ( - isinstance(language, str) - and language.startswith("Q") - and language[1:].isdigit() - ): - print( - f"Wikidata QID {language.capitalize()} passed. Checking validity and then all data types." - ) - language = check_qid_is_language(qid=language) - - else: - print(f"Returning total counts for {language.capitalize()} data types...\n") - - def print_total_header(language: str, dt: str, total_lexemes: str) -> None: - """ - Print the header of the total command output. - - Parameters - ---------- - language : str - The language for which to count lexemes. - - dt : str - The data type (e.g., "nouns", "verbs") for which to count lexemes. - - total_lexemes : str - The total number of lexemes derived formatted as a string. - - Returns - ------- - None - A message is printed to the terminal about the total number of lexemes. - """ - language_display = ( - "All Languages" if language is None else language.capitalize() - ) - print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") - print("=" * 70) - print(f"{language_display:<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}") - - if language is None: # all languages - languages = list_all_languages(language_metadata) - - for lang in languages: - data_types = get_datatype_list(lang) - - first_row = True - for dt in data_types: - total_lexemes = get_total_lexemes( - language=lang, data_type=dt, do_print=False - ) - total_lexemes = f"{total_lexemes:,}" - if first_row: - print_total_header(lang, dt, total_lexemes) - first_row = False - - else: - print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}") - - print() - - else: # individual language - first_row = True - if language.startswith("Q") and language[1:].isdigit(): - data_types = data_type_metadata - for t in ["emoji_keywords"]: - if t in data_types: - del data_types[t] - - else: - data_types = get_datatype_list(language) - - for dt in data_types: - total_lexemes = get_total_lexemes( - language=language, data_type=dt, do_print=False - ) - total_lexemes = f"{total_lexemes:,}" - if first_row: - print_total_header(language, dt, total_lexemes) - first_row = False - - else: - print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}") - - print() - - -# MARK: Get Total - - -def get_total_lexemes( - language: str, data_type: str, do_print: bool = True -) -> int | None: - """ - Get the total number of lexemes for a given language and data type from Wikidata. - - Parameters - ---------- - language : str - The language for which to count lexemes. - - data_type : str - The data type (e.g., "nouns", "verbs") for which to count lexemes. - - do_print : bool - Print the total lexemes for the given language and data type. - - Returns - ------- - str - A formatted string indicating the language, data type and total number of lexemes, if found. - """ - if ( - language is not None - and (language.startswith("Q") or language.startswith("q")) - and language[1:].isdigit() - ): - language_qid = language.capitalize() - - else: - language_qid = get_qid_by_input(language) - - if ( - data_type is not None - and (data_type.startswith("Q") or data_type.startswith("q")) - and data_type[1:].isdigit() - ): - data_type_qid = data_type.capitalize() - - else: - data_type_qid = get_qid_by_input(data_type) - - # MARK: Construct Query - - query_template = """ - SELECT - (COUNT(DISTINCT ?lexeme) as ?total) - - WHERE {{ - ?lexeme a ontolex:LexicalEntry . - {language_filter} - {data_type_filter} - }} - """ - - language_filter = ( - f"?lexeme dct:language wd:{language_qid} ." - if language_qid - else "?lexeme dct:language ?language ." - ) - - data_type_filter = ( - f"?lexeme wikibase:lexicalCategory wd:{data_type_qid} ." - if data_type_qid - else "?lexeme wikibase:lexicalCategory ?category ." - ) - - query = query_template.format( - language_filter=language_filter, data_type_filter=data_type_filter - ) - - # MARK: Query Results - - sparql.setQuery(query) - sparql.setReturnFormat(JSON) - try_count = 0 - max_retries = 2 - results = None - - while try_count <= max_retries and results is None: - try: - results = sparql.query().convert() - - except HTTPError as http_err: - print(f"HTTPError occurred: {http_err}") - - except IncompleteRead as read_err: - print(f"Incomplete read error occurred: {read_err}") - - try_count += 1 - - if results is None: - if try_count <= max_retries: - print("The query will be retried...") - - else: - print("Query failed after retries.") - return None - - # Check if the query returned any results. - if results is None: - print("Total number of lexemes: Not found") - return None - - res_dict = cast(dict[str, Any], results) - if ( - "results" in res_dict - and "bindings" in res_dict["results"] - and len(res_dict["results"]["bindings"]) > 0 - ): - total_lexemes = int( - res_dict.get("results", {}).get("bindings", [])[0]["total"]["value"] - ) - - output_template = "" - if language: - output_template += f"\nLanguage: {language.capitalize()}\n" - - if data_type: - output_template += f"Data type: {data_type}\n" - - output_template += f"Total number of lexemes: {total_lexemes:,}\n" - if do_print: - print(output_template) - - return total_lexemes - - print("Total number of lexemes: Not found") - return None - - -# MARK: Wrapper - - -def total_wrapper( - languages: list[str] | None = None, - data_types: list[str] | None = None, - all_bool: bool = False, - wikidata_dump: Path | bool | None = None, -) -> None: - """ - Conditionally provides the full functionality of the total command. - - Parameters - ---------- - languages : List[str] - The language(s) to potentially total data types for. - - data_types : List[str] - The data type(s) to check for. - - all_bool : bool - Whether all languages and data types should be listed. - - wikidata_dump : Optional[Union[Path, bool]] - The local Wikidata lexeme dump path that can be used to process data. - If True, indicates the flag was used without a path. - - Notes - ----- - Now accepts lists for language and data type to output a table of total lexemes. - """ - # Note: Handle --all flag via 'or ["all"]' assignments. - # Flag without a wikidata lexeme dump path. - if wikidata_dump is True: - parse_wd_lexeme_dump( - languages=languages or ["all"], - data_types=data_types or ["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - ) - return - - # If user provided a wikidata lexeme dump path. - if isinstance(wikidata_dump, Path): - parse_wd_lexeme_dump( - languages=languages or ["all"], - data_types=data_types or ["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=wikidata_dump, - ) - return - - language = languages[0] if languages else None # in case only one is passed - data_type = data_types[0] if data_types else None # in case only one is passed - - if (not languages and not data_types) and all_bool: - print_total_lexemes() - - elif languages and data_types and (len(languages) > 1 or len(data_types) > 1): - print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") - print("=" * 70) - - for lang in languages: - # Flag to check if it's the first data type for the language. - first_row = True - - for dt in data_types: - total_lexemes = get_total_lexemes( - language=lang, data_type=dt, do_print=False - ) - total_lexemes = ( - f"{int(total_lexemes):,}" if total_lexemes is not None else "N/A" - ) - if first_row: - print(f"{lang:<20} {dt:<25} {total_lexemes:<25}") - first_row = False - - else: - print( - f"{'':<20} {dt:<25} {total_lexemes:<25}" - ) # print empty space for language - - print() - - elif language is not None and data_type is None: - print_total_lexemes(language=language) - - elif language is not None and data_type is not None and not all_bool: - get_total_lexemes(language=language, data_type=data_type) - - elif language is not None and data_type is not None: - print( - f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all." - ) - get_total_lexemes(language=language, data_type=data_type) - - else: - raise ValueError("Invalid input or missing information") diff --git a/src/scribe_data/cli/total/__init__.py b/src/scribe_data/cli/total/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/cli/total/print_values.py b/src/scribe_data/cli/total/print_values.py new file mode 100644 index 000000000..31dad44a4 --- /dev/null +++ b/src/scribe_data/cli/total/print_values.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions to check the total language data available on Wikidata. +""" + +from scribe_data.cli.total.query import query_total_lexemes +from scribe_data.utils import ( + WIKIDATA_QUERIES_ALL_DATA_DIR, + check_qid_is_language, + data_type_metadata, + format_sublanguage_name, + language_metadata, + list_all_languages, +) + +# MARK: Data Types + + +def get_datatype_list(language: str) -> list | dict: + """ + Get the data types for a given language based on the project directory structure. + + Parameters + ---------- + language : str + The language to return data types for. + + Returns + ------- + list | dict + A list of the corresponding data types. + """ + language_key = language.strip().lower() # normalize input + languages = list_all_languages(language_metadata) + + # Adjust language_key for sub-languages using the format_sublanguage_name function. + formatted_language = format_sublanguage_name(language_key, language_metadata) + language_key = formatted_language.split(" ")[ + 0 + ].lower() # use the main language part if formatted + + if language_key in languages: + if "sub_languages" in language_metadata[language_key]: + sub_languages = language_metadata[language_key]["sub_languages"] + data_types = [] + + for sub_lang_key in sub_languages: + sub_lang_dir = ( + WIKIDATA_QUERIES_ALL_DATA_DIR / sub_languages[sub_lang_key]["iso"] + ) + if sub_lang_dir.exists(): + data_types.extend( + [f.name for f in sub_lang_dir.iterdir() if f.is_dir()] + ) + + if not data_types: + raise ValueError( + f"No data types available for sub-languages of '{formatted_language.capitalize()}'." + ) + + return sorted(set(data_types)) # remove duplicates and sort + + else: + language_dir = WIKIDATA_QUERIES_ALL_DATA_DIR / language_key + if not language_dir.exists(): + raise ValueError(f"Directory '{language_dir}' does not exist.") + + data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] + + if not data_types: + raise ValueError( + f"No data types available for language '{formatted_language.capitalize()}'." + ) + + return sorted(data_types) + + else: + return data_type_metadata + + +# MARK: Print + + +def print_total_lexemes(language: str | None = None) -> None: + """ + Print the total number of available entities for all data types. + + Parameters + ---------- + language : str, optional + The language to display data type entity counts for. + + Returns + ------- + str + A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. + """ + if language is None: + print("Returning total counts for all languages and data types...\n") + + elif ( + isinstance(language, str) + and language.startswith("Q") + and language[1:].isdigit() + ): + print( + f"Wikidata QID {language.capitalize()} passed. Checking validity and then all data types." + ) + language = check_qid_is_language(qid=language) + + else: + print(f"Returning total counts for {language.capitalize()} data types...\n") + + def print_total_header(language: str, dt: str, total_lexemes: str) -> None: + """ + Print the header of the total command output. + + Parameters + ---------- + language : str + The language for which to count lexemes. + + dt : str + The data type (e.g., "nouns", "verbs") for which to count lexemes. + + total_lexemes : str + The total number of lexemes derived formatted as a string. + + Returns + ------- + None + A message is printed to the terminal about the total number of lexemes. + """ + language_display = ( + "All Languages" if language is None else language.capitalize() + ) + print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") + print("=" * 70) + print(f"{language_display:<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}") + + if language is None: # all languages + languages = list_all_languages(language_metadata) + + for lang in languages: + data_types = get_datatype_list(lang) + + first_row = True + for dt in data_types: + total_lexemes = query_total_lexemes( + language=lang, data_type=dt, do_print=False + ) + total_lexemes = f"{total_lexemes:,}" + if first_row: + print_total_header(lang, dt, total_lexemes) + first_row = False + + else: + print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}") + + print() + + else: # individual language + first_row = True + if language.startswith("Q") and language[1:].isdigit(): + data_types = data_type_metadata + for t in ["emoji_keywords"]: + if t in data_types: + del data_types[t] + + else: + data_types = get_datatype_list(language) + + for dt in data_types: + total_lexemes = query_total_lexemes( + language=language, data_type=dt, do_print=False + ) + total_lexemes = f"{total_lexemes:,}" + if first_row: + print_total_header(language, dt, total_lexemes) + first_row = False + + else: + print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}") + + print() diff --git a/src/scribe_data/cli/total/query.py b/src/scribe_data/cli/total/query.py new file mode 100644 index 000000000..10b72e1f7 --- /dev/null +++ b/src/scribe_data/cli/total/query.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions to check the total language data available on Wikidata. +""" + +from http.client import IncompleteRead +from typing import Any, cast +from urllib.error import HTTPError + +from SPARQLWrapper import JSON + +from scribe_data.utils import ( + data_type_metadata, + language_to_qid, +) +from scribe_data.wikidata.wikidata_utils import sparql + +# MARK: QIDs + + +def get_qid_by_input(input_str: str | None) -> str | None: + """ + Retrieve the QID for a given language or data type input string. + + Parameters + ---------- + input_str : str, optional + The input string representing a language or data type. + + Returns + ------- + str | None + The QID corresponding to the input string, or- None if not found. + """ + if input_str: + if input_str in language_to_qid: + return language_to_qid[input_str] + + elif input_str in data_type_metadata: + return data_type_metadata[input_str] + + return None + + +# MARK: Query Total + + +def query_total_lexemes( + language: str, data_type: str, do_print: bool = True +) -> int | None: + """ + Query the total number of lexemes for a given language and data type from Wikidata. + + Parameters + ---------- + language : str + The language for which to count lexemes. + + data_type : str + The data type (e.g., "nouns", "verbs") for which to count lexemes. + + do_print : bool + Print the total lexemes for the given language and data type. + + Returns + ------- + str + A formatted string indicating the language, data type and total number of lexemes, if found. + """ + if ( + language is not None + and (language.startswith("Q") or language.startswith("q")) + and language[1:].isdigit() + ): + language_qid = language.capitalize() + + else: + language_qid = get_qid_by_input(language) + + if ( + data_type is not None + and (data_type.startswith("Q") or data_type.startswith("q")) + and data_type[1:].isdigit() + ): + data_type_qid = data_type.capitalize() + + else: + data_type_qid = get_qid_by_input(data_type) + + # MARK: Construct Query + + query_template = """ + SELECT + (COUNT(DISTINCT ?lexeme) as ?total) + + WHERE {{ + ?lexeme a ontolex:LexicalEntry . + {language_filter} + {data_type_filter} + }} + """ + + language_filter = ( + f"?lexeme dct:language wd:{language_qid} ." + if language_qid + else "?lexeme dct:language ?language ." + ) + + data_type_filter = ( + f"?lexeme wikibase:lexicalCategory wd:{data_type_qid} ." + if data_type_qid + else "?lexeme wikibase:lexicalCategory ?category ." + ) + + query = query_template.format( + language_filter=language_filter, data_type_filter=data_type_filter + ) + + # MARK: Query Results + + sparql.setQuery(query) + sparql.setReturnFormat(JSON) + try_count = 0 + max_retries = 2 + results = None + + while try_count <= max_retries and results is None: + try: + results = sparql.query().convert() + + except HTTPError as http_err: + print(f"HTTPError occurred: {http_err}") + + except IncompleteRead as read_err: + print(f"Incomplete read error occurred: {read_err}") + + try_count += 1 + + if results is None: + if try_count <= max_retries: + print("The query will be retried...") + + else: + print("Query failed after retries.") + return None + + # Check if the query returned any results. + if results is None: + print("Total number of lexemes: Not found") + return None + + res_dict = cast(dict[str, Any], results) + if ( + "results" in res_dict + and "bindings" in res_dict["results"] + and len(res_dict["results"]["bindings"]) > 0 + ): + total_lexemes = int( + res_dict.get("results", {}).get("bindings", [])[0]["total"]["value"] + ) + + output_template = "" + if language: + output_template += f"\nLanguage: {language.capitalize()}\n" + + if data_type: + output_template += f"Data type: {data_type}\n" + + output_template += f"Total number of lexemes: {total_lexemes:,}\n" + if do_print: + print(output_template) + + return total_lexemes + + print("Total number of lexemes: Not found") + return None diff --git a/src/scribe_data/cli/total/wrapper.py b/src/scribe_data/cli/total/wrapper.py new file mode 100644 index 000000000..13f0dd34a --- /dev/null +++ b/src/scribe_data/cli/total/wrapper.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Functions to check the total language data available on Wikidata. +""" + +from pathlib import Path + +from scribe_data.cli.total.print_values import print_total_lexemes +from scribe_data.cli.total.query import query_total_lexemes +from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump + +# MARK: Wrapper + + +def total_wrapper( + languages: list[str] | None = None, + data_types: list[str] | None = None, + all_bool: bool = False, + wikidata_dump: Path | bool | None = None, +) -> None: + """ + Conditionally provides the full functionality of the total command. + + Parameters + ---------- + languages : List[str] + The language(s) to potentially total data types for. + + data_types : List[str] + The data type(s) to check for. + + all_bool : bool + Whether all languages and data types should be listed. + + wikidata_dump : Optional[Union[Path, bool]] + The local Wikidata lexeme dump path that can be used to process data. + If True, indicates the flag was used without a path. + + Notes + ----- + Now accepts lists for language and data type to output a table of total lexemes. + """ + # Note: Handle --all flag via 'or ["all"]' assignments. + # Flag without a wikidata lexeme dump path. + if wikidata_dump is True: + parse_wd_lexeme_dump( + languages=languages or ["all"], + data_types=data_types or ["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + ) + return + + # If user provided a wikidata lexeme dump path. + if isinstance(wikidata_dump, Path): + parse_wd_lexeme_dump( + languages=languages or ["all"], + data_types=data_types or ["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=wikidata_dump, + ) + return + + language = languages[0] if languages else None # in case only one is passed + data_type = data_types[0] if data_types else None # in case only one is passed + + if (not languages and not data_types) and all_bool: + print_total_lexemes() + + elif languages and data_types and (len(languages) > 1 or len(data_types) > 1): + print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") + print("=" * 70) + + for lang in languages: + # Flag to check if it's the first data type for the language. + first_row = True + + for dt in data_types: + total_lexemes = query_total_lexemes( + language=lang, data_type=dt, do_print=False + ) + total_lexemes = ( + f"{int(total_lexemes):,}" if total_lexemes is not None else "N/A" + ) + if first_row: + print(f"{lang:<20} {dt:<25} {total_lexemes:<25}") + first_row = False + + else: + print( + f"{'':<20} {dt:<25} {total_lexemes:<25}" + ) # print empty space for language + + print() + + elif language is not None and data_type is None: + print_total_lexemes(language=language) + + elif language is not None and data_type is not None and not all_bool: + query_total_lexemes(language=language, data_type=data_type) + + elif language is not None and data_type is not None: + print( + f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all." + ) + query_total_lexemes(language=language, data_type=data_type) + + else: + raise ValueError("Invalid input or missing information") diff --git a/src/scribe_data/cli/upgrade.py b/src/scribe_data/cli/upgrade.py index 1e4ae1054..fd3c63179 100644 --- a/src/scribe_data/cli/upgrade.py +++ b/src/scribe_data/cli/upgrade.py @@ -15,6 +15,8 @@ get_local_version, ) +# MARK: Upgrade + def upgrade_cli() -> None: """ diff --git a/src/scribe_data/cli/version.py b/src/scribe_data/cli/version.py index 4f491bad4..17f411600 100644 --- a/src/scribe_data/cli/version.py +++ b/src/scribe_data/cli/version.py @@ -11,6 +11,8 @@ UNKNOWN_VERSION_NOT_PIP = f"{UNKNOWN_VERSION} (Not installed via pip)" UNKNOWN_VERSION_NOT_FETCHED = f"{UNKNOWN_VERSION} (Unable to fetch version)" +# MARK: Local + def get_local_version() -> str: """ @@ -28,6 +30,9 @@ def get_local_version() -> str: return UNKNOWN_VERSION_NOT_PIP +# MARK: Latest + + def get_latest_version() -> str: """ Get the latest version of Scribe-Data from the GitHub repository. @@ -47,6 +52,9 @@ def get_latest_version() -> str: return UNKNOWN_VERSION_NOT_FETCHED +# MARK: Check Version + + def get_version_message() -> str: """ Return a message about the current and up to date versions of Scribe-Data. diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index d1549e4f3..521fe3839 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -436,12 +436,12 @@ def process_file(self, file_path: str, batch_size: int = 50000) -> None: "Would you like to automatically re-download the dump file?", default=True, ).ask(): - from scribe_data.cli.download import wd_lexeme_dump_download_wrapper + from scribe_data.cli.download.wikidata_lexeme_dump import ( + wd_lexeme_dump_download_wrapper, + ) if new_file_path := wd_lexeme_dump_download_wrapper( - dump_snapshot="latest-lexemes", - output_dir=Path(file_path).parent, - default=True, + dump_snapshot="latest-lexemes", default=True ): if isinstance(new_file_path, str): rprint( diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 13a81aaca..7195bf160 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -8,7 +8,9 @@ from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper -from scribe_data.cli.download import wd_lexeme_dump_download_wrapper +from scribe_data.cli.download.wikidata_lexeme_dump import ( + wd_lexeme_dump_download_wrapper, +) from scribe_data.utils import ( DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, data_type_metadata, @@ -93,10 +95,7 @@ def parse_wd_lexeme_dump( f"Data types to process: {', '.join([d.capitalize() for d in data_types or []])}" ) - file_path = wd_lexeme_dump_download_wrapper( - dump_snapshot=None, - output_dir=wikidata_dump_path, - ) + file_path = wd_lexeme_dump_download_wrapper(dump_snapshot=None) if isinstance(file_path, (str, Path)): path = Path(file_path) diff --git a/src/scribe_data/wiktionary/parse_translations.py b/src/scribe_data/wiktionary/parse_translations.py index c32a6a156..cdcfce311 100644 --- a/src/scribe_data/wiktionary/parse_translations.py +++ b/src/scribe_data/wiktionary/parse_translations.py @@ -1305,7 +1305,7 @@ def _resolve_dump_path( import questionary - from scribe_data.cli.download import download_wiktionary_dumps + from scribe_data.cli.download.wiktionary_dump import download_wiktionary_dumps print(f"\nNo {wiktionary} dump found locally.") should_download = questionary.select( @@ -1316,9 +1316,7 @@ def _resolve_dump_path( if should_download == "Yes": downloaded_path = download_wiktionary_dumps( - output_dir=output_dir, - language_isos=[iso], - dump_snapshot="latest", + language_isos=[iso], dump_snapshot="latest" ) if downloaded_path and Path(downloaded_path).exists(): diff --git a/tests/check/test_check_project_metadata.py b/tests/check/test_check_project_metadata.py index 1927b8a02..f44dbbe46 100644 --- a/tests/check/test_check_project_metadata.py +++ b/tests/check/test_check_project_metadata.py @@ -16,7 +16,7 @@ class TestCheckProjectMetadata(unittest.TestCase): - def test_get_available_languages(self) -> None: + def test_check_get_available_languages(self) -> None: """ Tests that get_available_languages returns a dictionary with languages from WIKIDATA_QUERIES_ALL_DATA_DIR. @@ -75,7 +75,7 @@ def test_get_available_languages(self) -> None: self.assertEqual(len(available_languages), len(desired_dict)) - def test_get_missing_languages(self) -> None: + def test_check_get_missing_languages(self) -> None: """ Tests that get_missing_languages returns a list of languages missing given a target language dictionary to check for and a reference language @@ -120,7 +120,7 @@ def test_get_missing_languages(self) -> None: ["estonian/sub_estonian", "arabic", "chinese/mandarin"], ) - def test_validate_language_properties(self) -> None: + def test_check_validate_language_properties(self) -> None: """ Tests that validate_language_properties identifies languages missing an 'iso' and/or 'qid' field from a dictionary of languages. diff --git a/tests/cli/contracts/test_contracts_check.py b/tests/cli/contracts/test_cli_contracts_check.py similarity index 88% rename from tests/cli/contracts/test_contracts_check.py rename to tests/cli/contracts/test_cli_contracts_check.py index ea3fc0868..a4655bcd8 100644 --- a/tests/cli/contracts/test_contracts_check.py +++ b/tests/cli/contracts/test_cli_contracts_check.py @@ -12,9 +12,10 @@ from scribe_data.cli.contracts.check import ( check_contract_data_completeness, - check_contracts, + check_contract_data_print_missing, print_missing_forms, ) +from scribe_data.cli.contracts.filter import DEFAULT_DATA_CONTRACTS_DIR @pytest.fixture @@ -74,23 +75,19 @@ def mock_contract_metadata() -> dict[str, Any]: @patch("scribe_data.cli.contracts.check.check_contract_data_completeness") @patch("scribe_data.cli.contracts.check.print_missing_forms") -def test_check_contracts_with_dir( - mock_print: MagicMock, mock_check: MagicMock, mock_export_dir: Path -) -> None: +def test_check_contracts_with_dir(mock_print: MagicMock, mock_check: MagicMock) -> None: """ Test check_contracts with a specified directory. """ mock_check.return_value = {"English": {"verbs": ["future"]}} - check_contracts(output_dir=str(mock_export_dir)) - - mock_check.assert_called_once_with(mock_export_dir) + check_contract_data_print_missing(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) mock_print.assert_called_once_with({"English": {"verbs": ["future"]}}) @patch("scribe_data.cli.contracts.check.check_contract_data_completeness") @patch("scribe_data.cli.contracts.check.print_missing_forms") -def test_check_contracts_default_dir( +def test_cli_contracts_check_default_dir( mock_print: MagicMock, mock_check: MagicMock ) -> None: """ @@ -101,21 +98,21 @@ def test_check_contracts_default_dir( with patch("scribe_data.cli.contracts.check.Path") as mock_path: mock_path.return_value.exists.return_value = True - check_contracts() + check_contract_data_print_missing(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) mock_check.assert_called_once() mock_print.assert_called_once_with({}) @patch("scribe_data.cli.contracts.check.Path") -def test_check_contracts_nonexistent_dir(mock_path: MagicMock) -> None: +def test_cli_contracts_check_nonexistent_dir(mock_path: MagicMock) -> None: """ Test check_contracts with a nonexistent directory. """ mock_path.return_value.exists.return_value = False with patch("builtins.print") as mock_print: - check_contracts(output_dir="nonexistent_dir") + check_contract_data_print_missing(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) mock_print.assert_called_once() assert "Error: Directory" in mock_print.call_args[0][0] @@ -125,7 +122,7 @@ def test_check_contracts_nonexistent_dir(mock_path: MagicMock) -> None: @patch("scribe_data.cli.contracts.check.data_contracts_langs", ["English"]) @patch("scribe_data.cli.contracts.check.get_language_iso") @patch("scribe_data.cli.contracts.check.filter_contract_metadata") -def test_check_contract_data_completeness_json_error( +def test_cli_contracts_check_data_completeness_json_error( mock_filter_metadata: MagicMock, mock_get_iso: MagicMock, mock_export_dir: Path, @@ -148,7 +145,7 @@ def test_check_contract_data_completeness_json_error( assert "Error reading" in mock_print.call_args[0][0] -def test_print_missing_forms_none() -> None: +def test_cli_contracts_print_missing_forms_none() -> None: """ Test print_missing_forms with no missing forms. """ @@ -161,7 +158,7 @@ def test_print_missing_forms_none() -> None: ) -def test_print_missing_forms_with_missing() -> None: +def test_cli_contracts_print_missing_forms_with_missing() -> None: """ Test print_missing_forms with missing forms. """ diff --git a/tests/cli/contracts/test_export.py b/tests/cli/contracts/test_cli_contracts_export.py similarity index 61% rename from tests/cli/contracts/test_export.py rename to tests/cli/contracts/test_cli_contracts_export.py index 59671d599..95e223caf 100644 --- a/tests/cli/contracts/test_export.py +++ b/tests/cli/contracts/test_cli_contracts_export.py @@ -10,6 +10,7 @@ import pytest from scribe_data.cli.contracts.export import export_contracts +from scribe_data.utils import DEFAULT_CONTRACTS_EXPORT_DIR @pytest.fixture @@ -24,50 +25,46 @@ def contracts_source(tmp_path: Path) -> Path: return source -def test_export_contracts_fresh_export(tmp_path: Path, contracts_source: Path) -> None: +def test_cli_contracts_export_fresh_export( + tmp_path: Path, contracts_source: Path +) -> None: """ Test fresh export when no existing contracts folder. """ - output_dir = tmp_path / "output" / "contracts" - with patch( "scribe_data.cli.contracts.export.Path.__truediv__", return_value=contracts_source, ): - export_contracts(output_dir=output_dir) + export_contracts() - assert output_dir.exists() - assert (output_dir / "en.yaml").exists() - assert (output_dir / "de.yaml").exists() + assert DEFAULT_CONTRACTS_EXPORT_DIR.exists() + assert (DEFAULT_CONTRACTS_EXPORT_DIR / "en.yaml").exists() + assert (DEFAULT_CONTRACTS_EXPORT_DIR / "de.yaml").exists() -def test_export_contracts_success_message( +def test_cli_contracts_export_success_message( tmp_path: Path, contracts_source: Path, capsys ) -> None: """ Test success message after fresh export. """ - output_dir = tmp_path / "output" / "contracts" - with patch( "scribe_data.cli.contracts.export.Path.__truediv__", return_value=contracts_source, ): - export_contracts(output_dir=output_dir) + export_contracts() captured = capsys.readouterr() assert "successfully exported" in captured.out.lower() -def test_export_contracts_overwrite_confirmed( +def test_cli_contracts_export_overwrite_confirmed( tmp_path: Path, contracts_source: Path ) -> None: """ Test overwrite when user confirms with y. """ - output_dir = tmp_path / "output" / "contracts" - output_dir.mkdir(parents=True) - (output_dir / "old.yaml").write_text("old data") + (DEFAULT_CONTRACTS_EXPORT_DIR / "old.yaml").write_text("old data") with ( patch( @@ -76,21 +73,19 @@ def test_export_contracts_overwrite_confirmed( ), patch("builtins.input", return_value="y"), ): - export_contracts(output_dir=output_dir) + export_contracts() - assert (output_dir / "en.yaml").exists() - assert not (output_dir / "old.yaml").exists() + assert (DEFAULT_CONTRACTS_EXPORT_DIR / "en.yaml").exists() + assert not (DEFAULT_CONTRACTS_EXPORT_DIR / "old.yaml").exists() -def test_export_contracts_overwrite_declined( +def test_cli_contracts_export_overwrite_declined( tmp_path: Path, contracts_source: Path, capsys ) -> None: """ Test export cancelled when user declines with n. """ - output_dir = tmp_path / "output" / "contracts" - output_dir.mkdir(parents=True) - (output_dir / "old.yaml").write_text("old data") + (DEFAULT_CONTRACTS_EXPORT_DIR / "old.yaml").write_text("old data") with ( patch( @@ -99,53 +94,53 @@ def test_export_contracts_overwrite_declined( ), patch("builtins.input", return_value="n"), ): - export_contracts(output_dir=output_dir) + export_contracts() captured = capsys.readouterr() assert "cancelled" in captured.out.lower() - assert (output_dir / "old.yaml").exists() + assert (DEFAULT_CONTRACTS_EXPORT_DIR / "old.yaml").exists() -def test_export_contracts_source_not_found(tmp_path: Path) -> None: +def test_cli_contracts_export_source_not_found(tmp_path: Path) -> None: """ Test assertion error when source directory not found. """ fake_source = tmp_path / "nonexistent" - output_dir = tmp_path / "output" / "contracts" with patch( "scribe_data.cli.contracts.export.Path.__truediv__", return_value=fake_source, ): with pytest.raises(AssertionError): - export_contracts(output_dir=output_dir) + export_contracts() -def test_export_contracts_files_content(tmp_path: Path, contracts_source: Path) -> None: +def test_cli_contracts_export_files_content( + tmp_path: Path, contracts_source: Path +) -> None: """ Test that exported files have correct content. """ - output_dir = tmp_path / "output" / "contracts" - with patch( "scribe_data.cli.contracts.export.Path.__truediv__", return_value=contracts_source, ): - export_contracts(output_dir=output_dir) + export_contracts() - assert (output_dir / "en.yaml").read_text() == "language: english\n" - assert (output_dir / "de.yaml").read_text() == "language: german\n" + assert ( + DEFAULT_CONTRACTS_EXPORT_DIR / "en.yaml" + ).read_text() == "language: english\n" + assert ( + DEFAULT_CONTRACTS_EXPORT_DIR / "de.yaml" + ).read_text() == "language: german\n" -def test_export_contracts_overwrite_default_declined( +def test_cli_contracts_export_overwrite_default_declined( tmp_path: Path, contracts_source: Path, capsys ) -> None: """ Test that default response cancels export. """ - output_dir = tmp_path / "output" / "contracts" - output_dir.mkdir(parents=True) - with ( patch( "scribe_data.cli.contracts.export.Path.__truediv__", @@ -153,7 +148,7 @@ def test_export_contracts_overwrite_default_declined( ), patch("builtins.input", return_value=""), ): - export_contracts(output_dir=output_dir) + export_contracts() captured = capsys.readouterr() assert "cancelled" in captured.out.lower() diff --git a/tests/cli/contracts/test_contracts_export.py b/tests/cli/contracts/test_cli_contracts_filter.py similarity index 87% rename from tests/cli/contracts/test_contracts_export.py rename to tests/cli/contracts/test_cli_contracts_filter.py index cd617f5ef..3fa1e56c2 100644 --- a/tests/cli/contracts/test_contracts_export.py +++ b/tests/cli/contracts/test_cli_contracts_filter.py @@ -11,15 +11,13 @@ filter_contract_metadata, filter_exported_data, ) -from scribe_data.utils import ( - DEFAULT_DATA_CONTRACTS_DIR, - DEFAULT_FILTERED_JSON_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, -) +from scribe_data.utils import DEFAULT_DATA_CONTRACTS_DIR + +# MARK: Metadata -class TestFilterContractMetadata: - def test_filter_contract_metadata_empty_file(self) -> None: +class TestCLIContractFilterMetaData: + def test_cli_contracts_filter_metadata_empty_file(self) -> None: """ Test filtering with an empty contract file. """ @@ -31,7 +29,7 @@ def test_filter_contract_metadata_empty_file(self) -> None: "verbs": {"conjugations": []}, } - def test_filter_contract_metadata_numbers_dict(self) -> None: + def test_cli_contracts_filter_metadata_numbers_dict(self) -> None: """ Test filtering numbers as a dictionary. """ @@ -48,7 +46,7 @@ def test_filter_contract_metadata_numbers_dict(self) -> None: assert "" not in result["nouns"]["numbers"] assert "collective" in result["nouns"]["numbers"] - def test_filter_contract_metadata_numbers_list(self) -> None: + def test_cli_contracts_filter_metadata_numbers_list(self) -> None: """ Test filtering numbers as a list. """ @@ -61,7 +59,7 @@ def test_filter_contract_metadata_numbers_list(self) -> None: result = filter_contract_metadata(Path("fake_path.json")) assert set(result["nouns"]["numbers"]) == {"singular", "plural", "dual"} - def test_filter_contract_metadata_numbers_string(self) -> None: + def test_cli_contracts_filter_metadata_numbers_string(self) -> None: """ Test filtering numbers as a string. """ @@ -74,7 +72,7 @@ def test_filter_contract_metadata_numbers_string(self) -> None: result = filter_contract_metadata(Path("fake_path.json")) assert set(result["nouns"]["numbers"]) == {"singular", "plural", "dual"} - def test_filter_contract_metadata_genders(self) -> None: + def test_cli_contracts_filter_metadata_genders(self) -> None: """ Test filtering genders. """ @@ -93,7 +91,7 @@ def test_filter_contract_metadata_genders(self) -> None: assert "NOT_INCLUDED" not in result["nouns"]["genders"] assert "" not in result["nouns"]["genders"] - def test_filter_contract_metadata_conjugations_list(self) -> None: + def test_cli_contracts_filter_metadata_conjugations_list(self) -> None: """ Test filtering conjugations as a list. """ @@ -107,7 +105,7 @@ def test_filter_contract_metadata_conjugations_list(self) -> None: assert set(result["verbs"]["conjugations"]) == {"run", "runs", "ran"} assert "[running]" not in result["verbs"]["conjugations"] - def test_filter_contract_metadata_error_handling(self) -> None: + def test_cli_contracts_filter_metadata_error_handling(self) -> None: """ Test error handling for invalid YAML. """ @@ -118,8 +116,11 @@ def test_filter_contract_metadata_error_handling(self) -> None: mock_print.assert_called_once() -class TestFilterExportedData: - def test_filter_exported_data_nouns(self) -> None: +# MARK: Exported Data + + +class TestCLIContractsFilterExportedData: + def test_cli_contracts_filter_exported_data_nouns(self) -> None: """ Test filtering exported noun data. """ @@ -169,7 +170,7 @@ def test_filter_exported_data_nouns(self) -> None: assert result["L2"]["singular"] == "dog" assert "irrelevant" not in result["L2"] - def test_filter_exported_data_verbs(self) -> None: + def test_cli_contracts_filter_exported_data_verbs(self) -> None: """ Test filtering exported verb data. """ @@ -211,7 +212,7 @@ def test_filter_exported_data_verbs(self) -> None: # L4 should not be included as it doesn't have enough valid fields. assert "L4" not in result - def test_filter_exported_data_unsupported_type(self) -> None: + def test_cli_contracts_filter_exported_data_unsupported_type(self) -> None: """ Test filtering with unsupported data type. """ @@ -226,7 +227,7 @@ def test_filter_exported_data_unsupported_type(self) -> None: ) assert result == {} - def test_filter_exported_data_error_handling(self) -> None: + def test_cli_contracts_filter_exported_data_error_handling(self) -> None: """ Test error handling for invalid JSON. """ @@ -244,7 +245,10 @@ def test_filter_exported_data_error_handling(self) -> None: mock_print.assert_called_once() -class TestExportContracts: +# MARK: Export + + +class TestCLIContractsFilterExport: @patch("scribe_data.cli.contracts.filter.filter_contract_metadata") @patch("scribe_data.cli.contracts.filter.filter_exported_data") @patch("scribe_data.cli.contracts.filter.get_language_from_iso") @@ -253,7 +257,7 @@ class TestExportContracts: @patch("pathlib.Path.exists") @patch("builtins.open", new_callable=mock_open) @patch("json.dump") - def test_export_data_filtered_by_contracts( + def test_cli_contracts_filter_export_data_filtered_by_contracts( self, mock_json_dump: MagicMock, mock_file_open: MagicMock, @@ -327,11 +331,7 @@ def mock_path_glob(self: Path, pattern: str) -> list[Path]: with patch.object(Path, "glob", mock_path_glob): # Call the function. - export_data_filtered_by_contracts( - contracts_dir=DEFAULT_DATA_CONTRACTS_DIR, - input_dir="test_input", - output_dir="test_output", - ) + export_data_filtered_by_contracts(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) assert mock_mkdir.call_count >= 3 # main dir + 2 language dirs assert mock_filter_metadata.call_count == 2 # one for each language @@ -361,7 +361,7 @@ def mock_path_glob(self: Path, pattern: str) -> list[Path]: @patch("scribe_data.cli.contracts.filter.get_language_from_iso") @patch("os.listdir") @patch("pathlib.Path.mkdir") - def test_export_data_filtered_by_contracts_no_language_match( + def test_cli_contracts_filter_export_data_filtered_by_contracts_no_language_match( self, mock_mkdir: MagicMock, mock_listdir: MagicMock, @@ -375,11 +375,7 @@ def test_export_data_filtered_by_contracts_no_language_match( mock_get_language.return_value = None with patch("builtins.print") as mock_print: - export_data_filtered_by_contracts( - contracts_dir=DEFAULT_DATA_CONTRACTS_DIR, - input_dir=DEFAULT_JSON_EXPORT_DIR, - output_dir=DEFAULT_FILTERED_JSON_EXPORT_DIR, - ) + export_data_filtered_by_contracts(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) # Verify warning was printed. mock_print.assert_called_with( @@ -394,7 +390,7 @@ def test_export_data_filtered_by_contracts_no_language_match( @patch("os.listdir") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.exists") - def test_export_data_filtered_by_contracts_no_input_file( + def test_cli_contracts_filter_export_data_filtered_by_contracts_no_input_file( self, mock_exists: MagicMock, mock_mkdir: MagicMock, @@ -414,11 +410,7 @@ def test_export_data_filtered_by_contracts_no_input_file( } with patch("builtins.print") as mock_print: - export_data_filtered_by_contracts( - contracts_dir=DEFAULT_DATA_CONTRACTS_DIR, - input_dir=DEFAULT_JSON_EXPORT_DIR, - output_dir=DEFAULT_FILTERED_JSON_EXPORT_DIR, - ) + export_data_filtered_by_contracts(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) # Verify warning was printed - expects "No input directory found for English". assert mock_print.call_count >= 1 @@ -428,7 +420,7 @@ def test_export_data_filtered_by_contracts_no_input_file( @patch("scribe_data.cli.contracts.filter.get_language_from_iso") @patch("os.listdir") @patch("pathlib.Path.mkdir") - def test_export_data_filtered_by_contracts_empty_metadata( + def test_cli_contracts_filter_export_data_filtered_by_contracts_empty_metadata( self, mock_mkdir: MagicMock, mock_listdir: MagicMock, @@ -442,11 +434,7 @@ def test_export_data_filtered_by_contracts_empty_metadata( mock_get_language.return_value = "English" mock_filter_metadata.return_value = {} - export_data_filtered_by_contracts( - contracts_dir=DEFAULT_DATA_CONTRACTS_DIR, - input_dir=DEFAULT_JSON_EXPORT_DIR, - output_dir=DEFAULT_FILTERED_JSON_EXPORT_DIR, - ) + export_data_filtered_by_contracts(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) # Verify no further processing happens when metadata is empty. mock_filter_metadata.assert_called_once() diff --git a/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py b/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py new file mode 100644 index 000000000..acce141c6 --- /dev/null +++ b/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI convert functionality. +""" + +import unittest + +import pytest + +from scribe_data.cli.convert.to_csv_or_tsv import convert_to_csv_or_tsv +from scribe_data.utils import DEFAULT_CSV_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR + +# MARK: CSV or TSV + + +class TestCLIConvertToCSVorTSV(unittest.TestCase): + @pytest.fixture(autouse=True) + def _setup_fixtures(self, tmp_path): + self.tmp_path = tmp_path + + def test_cli_convert_to_csv_or_json_empty_language(self) -> None: + json_data = '{"key1": "value1", "key2": "value2"}' + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + with self.assertRaises(ValueError) as context: + convert_to_csv_or_tsv( + language="", + data_types="nouns", + input_file=input_file, + output_type="csv", + overwrite=True, + ) + + self.assertEqual(str(context.exception), "Language '' is not recognized.") + + def test_cli_convert_to_csv_or_tsv_standard_dict_to_csv(self) -> None: + json_data = '{"a": "1", "b": "2"}' + expected_csv_output = "preposition,value\na,1\nb,2\n" + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + convert_to_csv_or_tsv( + language="English", + data_types="prepositions", + input_file=input_file, + output_type="csv", + overwrite=True, + ) + + output_file = DEFAULT_CSV_EXPORT_DIR / "English" / "prepositions.csv" + actual_content = output_file.read_text(encoding="utf-8") + assert actual_content == expected_csv_output + + def test_cli_convert_to_csv_or_tsv_standard_dict_to_tsv(self) -> None: + json_data = '{"a": "1", "b": "2"}' + expected_tsv_output = "preposition\tvalue\na\t1\nb\t2\n" + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + convert_to_csv_or_tsv( + language="English", + data_types="prepositions", + input_file=input_file, + output_type="tsv", + overwrite=True, + ) + + output_file = DEFAULT_TSV_EXPORT_DIR / "English" / "prepositions.tsv" + actual_content = output_file.read_text(encoding="utf-8") + assert actual_content == expected_tsv_output + + def test_cli_convert_to_csv_or_tsv_nested_dict_to_csv(self) -> None: + json_data = ( + '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' + ) + expected_csv_output = "noun,value1,value2\na,1,x\nb,2,y\n" + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + convert_to_csv_or_tsv( + language="English", + data_types="nouns", + input_file=input_file, + output_type="csv", + overwrite=True, + ) + + output_file = DEFAULT_CSV_EXPORT_DIR / "English" / "nouns.csv" + actual_content = output_file.read_text(encoding="utf-8") + assert actual_content == expected_csv_output + + def test_cli_convert_to_csv_or_tsv_nested_dict_to_tsv(self) -> None: + json_data = ( + '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' + ) + expected_tsv_output = "noun\tvalue1\tvalue2\na\t1\tx\nb\t2\ty\n" + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + convert_to_csv_or_tsv( + language="English", + data_types="nouns", + input_file=input_file, + output_type="tsv", + overwrite=True, + ) + + output_file = DEFAULT_TSV_EXPORT_DIR / "English" / "nouns.tsv" + actual_content = output_file.read_text(encoding="utf-8") + assert actual_content == expected_tsv_output + + def test_cli_convert_to_csv_or_tsv_list_of_dicts_to_csv(self) -> None: + json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' + expected_csv_output = "word,emoji,is_base,rank\na,😀,True,1\na,😅,False,2\n" + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + convert_to_csv_or_tsv( + language="English", + data_types="emoji-keywords", + input_file=input_file, + output_type="csv", + overwrite=True, + ) + + output_file = DEFAULT_CSV_EXPORT_DIR / "English" / "emoji-keywords.csv" + actual_content = output_file.read_text(encoding="utf-8") + assert actual_content == expected_csv_output + + def test_cli_convert_to_csv_or_tsv_list_of_dicts_to_tsv(self) -> None: + json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' + expected_tsv_output = ( + "word\temoji\tis_base\trank\na\t😀\tTrue\t1\na\t😅\tFalse\t2\n" + ) + + input_file = self.tmp_path / "test.json" + input_file.write_text(json_data, encoding="utf-8") + + convert_to_csv_or_tsv( + language="English", + data_types="emoji-keywords", + input_file=input_file, + output_type="tsv", + overwrite=True, + ) + + output_file = DEFAULT_TSV_EXPORT_DIR / "English" / "emoji-keywords.tsv" + actual_content = output_file.read_text(encoding="utf-8") + assert actual_content == expected_tsv_output diff --git a/tests/cli/convert/test_cli_convert_to_json.py b/tests/cli/convert/test_cli_convert_to_json.py new file mode 100644 index 000000000..0c3b1449f --- /dev/null +++ b/tests/cli/convert/test_cli_convert_to_json.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI convert functionality. +""" + +import json +import unittest +from io import StringIO +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from scribe_data.cli.convert.to_json import convert_to_json +from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR + +# MARK: JSON + + +class TestCLIConvertToJSON(unittest.TestCase): + @pytest.fixture(autouse=True) + def _setup_fixtures(self, tmp_path): + self.tmp_path = tmp_path + + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_cli_convert_to_json_empty_language(self, mock_path: MagicMock) -> None: + csv_data = "key,value\na,1\nb,2" + mock_file = StringIO(csv_data) + + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + mock_path_obj.suffix = ".csv" + mock_path_obj.exists.return_value = True + mock_path_obj.open.return_value.__enter__.return_value = mock_file + + with self.assertRaises(ValueError) as context: + convert_to_json( + language="", + data_types="nouns", + input_file=Path("input.csv"), + output_type="json", + overwrite=True, + ) + self.assertIn("Language '' is not recognized.", str(context.exception)) + + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_cli_convert_to_json_supported_file_extension_csv( + self, mock_path_class: MagicMock + ) -> None: + mock_path_instance = MagicMock(spec=Path) + + mock_path_class.return_value = mock_path_instance + + mock_path_instance.suffix = ".csv" + mock_path_instance.exists.return_value = True + + convert_to_json( + language="English", + data_types="nouns", + input_file=Path("test.csv"), + output_type="json", + overwrite=True, + ) + + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_cli_convert_to_json_supported_file_extension_tsv( + self, mock_path_class: MagicMock + ) -> None: + mock_path_instance = MagicMock(spec=Path) + + mock_path_class.return_value = mock_path_instance + + mock_path_instance.suffix = ".tsv" + mock_path_instance.exists.return_value = True + + convert_to_json( + language="English", + data_types="nouns", + input_file=Path("test.tsv"), + output_type="json", + overwrite=True, + ) + + def test_cli_convert_to_json_unsupported_file_extension(self) -> None: + input_file = self.tmp_path / "test.txt" + input_file.write_text("Hello, world!", encoding="utf-8") + + with self.assertRaises(ValueError) as context: + convert_to_json( + language="English", + data_types="nouns", + input_file=input_file, + output_type="json", + overwrite=True, + ) + + self.assertIn("Unsupported file extension", str(context.exception)) + self.assertEqual( + str(context.exception), + f"Unsupported file extension '.txt' for {input_file}. Please provide a '.csv' or '.tsv' file.", + ) + + def test_cli_convert_to_json_standard_csv(self) -> None: + csv_data = "key,value\na,1\nb,2" + expected_json_output = {"a": "1", "b": "2"} + + input_file = self.tmp_path / "test.csv" + input_file.write_text(csv_data, encoding="utf-8") + + convert_to_json( + language="English", + data_types="nouns", + input_file=input_file, + output_type="json", + overwrite=True, + ) + + output_file = DEFAULT_JSON_EXPORT_DIR / "English" / "nouns.json" + with open(output_file, "r", encoding="utf-8") as f: + actual_content = json.load(f) + + assert actual_content == expected_json_output + + def test_cli_convert_to_json_with_multiple_keys(self) -> None: + csv_data = "key,value1,value2\na,1,x\nb,2,y\nc,3,z" + expected_json_output = { + "a": {"value1": "1", "value2": "x"}, + "b": {"value1": "2", "value2": "y"}, + "c": {"value1": "3", "value2": "z"}, + } + + input_file = self.tmp_path / "test.csv" + input_file.write_text(csv_data, encoding="utf-8") + + convert_to_json( + language="English", + data_types="nouns", + input_file=input_file, + output_type="json", + overwrite=True, + ) + + output_file = DEFAULT_JSON_EXPORT_DIR / "English" / "nouns.json" + with open(output_file, "r", encoding="utf-8") as f: + actual_content = json.load(f) + + assert actual_content == expected_json_output + + def test_cli_convert_to_json_with_complex_structure(self) -> None: + csv_data = "key,emoji,is_base,rank\na,😀,true,1\nb,😅,false,2" + expected_json_output = { + "a": [{"emoji": "😀", "is_base": True, "rank": 1}], + "b": [{"emoji": "😅", "is_base": False, "rank": 2}], + } + + input_file = self.tmp_path / "test.csv" + input_file.write_text(csv_data, encoding="utf-8") + + convert_to_json( + language="English", + data_types="nouns", + input_file=input_file, + output_type="json", + overwrite=True, + ) + + output_file = DEFAULT_JSON_EXPORT_DIR / "English" / "nouns.json" + with open(output_file, "r", encoding="utf-8") as f: + actual_content = json.load(f) + + assert actual_content == expected_json_output diff --git a/tests/load/test_data_to_sqlite.py b/tests/cli/convert/test_cli_convert_to_sqlite.py similarity index 84% rename from tests/load/test_data_to_sqlite.py rename to tests/cli/convert/test_cli_convert_to_sqlite.py index 08701dd86..03f5c0612 100644 --- a/tests/load/test_data_to_sqlite.py +++ b/tests/cli/convert/test_cli_convert_to_sqlite.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-3.0-or-later """ -Test the data_to_sqlite function. +Test the convert_to_sqlite function. """ import json @@ -11,13 +11,14 @@ import pytest -from scribe_data.load.data_to_sqlite import ( +from scribe_data.cli.convert.to_sqlite import ( + convert_to_sqlite, create_table, - data_to_sqlite, table_insert, translations_to_sqlite, wiktionary_translations_to_sqlite, ) +from scribe_data.utils import DEFAULT_SQLITE_EXPORT_DIR @pytest.fixture @@ -63,6 +64,9 @@ def temp_json_dir(tmp_path: Path) -> Path: return json_dir +# MARK: Operations + + def test_create_table(temp_db: Any) -> None: """ Test creating a table with both snake and camel case identifiers. @@ -106,22 +110,21 @@ def translations_setup(tmp_path: Path) -> dict[str, Any]: """ Pytest fixture to handle common setup for translations_to_sqlite tests. """ - output_dir = tmp_path / "sqlite_output" - output_dir.mkdir() - lang_data_type_dict = {"english": ["translations"]} current_languages = ["english", "german", "french"] - expected_db_path = output_dir / "TranslationData.sqlite" + expected_db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" return { - "output_dir": output_dir, "lang_data_type_dict": lang_data_type_dict, "current_languages": current_languages, "expected_db_path": expected_db_path, } -def test_translations_to_sqlite( +# MARK: Conversions + + +def test_cli_convert_translations_to_sqlite( temp_json_dir: Path, translations_setup: dict[str, Any] ) -> None: """ @@ -132,12 +135,11 @@ def test_translations_to_sqlite( translations_setup["lang_data_type_dict"], translations_setup["current_languages"], input_file=str(temp_json_dir), - output_file=str(translations_setup["output_dir"]), overwrite=True, ) # Verify database creation. - db_path = translations_setup["output_dir"] / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" assert db_path.exists() # Check database contents. @@ -151,7 +153,7 @@ def test_translations_to_sqlite( conn.close() -def test_overwrite_existing_file_user_confirms( +def test_cli_convert_overwrite_existing_file_user_confirms( temp_json_dir: Path, translations_setup: dict[str, Any] ) -> None: """ @@ -168,7 +170,6 @@ def test_overwrite_existing_file_user_confirms( translations_setup["lang_data_type_dict"], translations_setup["current_languages"], input_file=str(temp_json_dir), - output_file=str(translations_setup["output_dir"]), overwrite=False, ) @@ -177,7 +178,7 @@ def test_overwrite_existing_file_user_confirms( mock_remove.assert_called_once_with(translations_setup["expected_db_path"]) -def test_overwrite_existing_file_user_declines( +def test_cli_convert_overwrite_existing_file_user_declines( temp_json_dir: Path, translations_setup: dict[str, Any] ) -> None: """ @@ -195,7 +196,6 @@ def test_overwrite_existing_file_user_declines( translations_setup["lang_data_type_dict"], translations_setup["current_languages"], input_file=str(temp_json_dir), - output_file=str(translations_setup["output_dir"]), overwrite=False, ) @@ -204,7 +204,7 @@ def test_overwrite_existing_file_user_declines( mock_print.assert_called_with("Skipping translation DB creation.") -def test_translations_to_sqlite_missing_json( +def test_cli_convert_translations_to_sqlite_missing_json( temp_json_dir: Path, translations_setup: dict[str, Any], capsys: Any ) -> None: """ @@ -219,7 +219,6 @@ def test_translations_to_sqlite_missing_json( input_file=str( temp_json_dir ), # temp_json_dir won't have 'nonexistent_lang/translations.json' - output_file=str(translations_setup["output_dir"]), overwrite=True, ) @@ -242,7 +241,7 @@ def __getattr__(self, name: str) -> Any: return getattr(self._conn, name) -def test_translations_to_sqlite_commit_error( +def test_cli_convert_translations_to_sqlite_commit_error( temp_json_dir: Path, translations_setup: dict[str, Any], capsys: Any ) -> None: """ @@ -260,7 +259,6 @@ def mock_connect(*args: Any, **kwargs: Any) -> MockConnection: translations_setup["lang_data_type_dict"], translations_setup["current_languages"], input_file=str(temp_json_dir), - output_file=str(translations_setup["output_dir"]), overwrite=True, ) @@ -269,15 +267,15 @@ def mock_connect(*args: Any, **kwargs: Any) -> MockConnection: assert "mock commit error" in captured.out -def test_data_to_sqlite_invalid_language() -> None: +def test_cli_convert_to_sqlite_invalid_language() -> None: """ - Test data_to_sqlite with invalid language. + Test convert_to_sqlite with invalid language. """ with pytest.raises(ValueError): - data_to_sqlite(languages=["invalid_language"]) + convert_to_sqlite(languages=["invalid_language"]) -def test_create_table_duplicate_columns(temp_db: Any) -> None: +def test_cli_convert_create_table_duplicate_columns(temp_db: Any) -> None: """ Test creating a table with duplicate column names. """ @@ -294,11 +292,9 @@ def test_create_table_duplicate_columns(temp_db: Any) -> None: assert len(set(columns)) == 3 # all columns should be unique -def test_data_to_sqlite_translations_and_nouns(tmp_path: Path) -> None: +def test_cli_convert_convert_to_sqlite_translations_and_nouns(tmp_path: Path) -> None: input_dir = tmp_path / "input" - output_dir = tmp_path / "output" input_dir.mkdir() - output_dir.mkdir() # Create language folder. english_dir = input_dir / "english" @@ -320,20 +316,19 @@ def test_data_to_sqlite_translations_and_nouns(tmp_path: Path) -> None: } (english_dir / "nouns.json").write_text(json.dumps(nouns_data)) - data_to_sqlite( + convert_to_sqlite( languages=["english"], specific_tables=None, input_file=str(input_dir), - output_file=str(output_dir), overwrite=True, ) # Assert TranslationData.sqlite exists. - translation_db = output_dir / "TranslationData.sqlite" + translation_db = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" assert translation_db.exists() # Assert ENLanguageData.sqlite (for other tables) exists. - noun_db = output_dir / "ENLanguageData.sqlite" + noun_db = DEFAULT_SQLITE_EXPORT_DIR / "ENLanguageData.sqlite" assert noun_db.exists() # Check nouns table created and has data. @@ -352,7 +347,7 @@ def test_data_to_sqlite_translations_and_nouns(tmp_path: Path) -> None: assert len(scribe_row) == 1 -def test_data_to_sqlite_skips_missing_json(tmp_path: Path) -> None: +def test_cli_convert_convert_to_sqlite_skips_missing_json(tmp_path: Path) -> None: input_dir = tmp_path / "input" input_dir.mkdir() lang_dir = input_dir / "english" @@ -364,19 +359,22 @@ def test_data_to_sqlite_skips_missing_json(tmp_path: Path) -> None: mock.patch("scribe_data.utils.data_type_metadata", {"nouns": None}), mock.patch("scribe_data.utils.language_metadata", {"english": {}}), mock.patch("scribe_data.utils.list_all_languages", return_value=["english"]), - mock.patch("scribe_data.load.data_to_sqlite.create_table") as mock_create_table, - mock.patch("scribe_data.load.data_to_sqlite.table_insert") as mock_table_insert, + mock.patch( + "scribe_data.cli.convert.to_sqlite.create_table" + ) as mock_create_table, + mock.patch( + "scribe_data.cli.convert.to_sqlite.table_insert" + ) as mock_table_insert, mock.patch( "scribe_data.utils.get_language_iso", side_effect=lambda lang: lang[:2].upper(), ), ): - # Run data_to_sqlite for 'nouns' only, but JSON file missing. - data_to_sqlite( + # Run convert_to_sqlite for 'nouns' only, but JSON file missing. + convert_to_sqlite( languages=["english"], specific_tables=["nouns"], input_file=str(input_dir), - output_file=str(tmp_path / "output"), overwrite=True, ) @@ -385,17 +383,12 @@ def test_data_to_sqlite_skips_missing_json(tmp_path: Path) -> None: assert not mock_table_insert.called -# MARK: Wiktionary translations to SQLite - - -def test_wiktionary_translations_to_sqlite_basic(tmp_path): +def test_cli_convert_wiktionary_translations_to_sqlite_basic(tmp_path): """ Test basic wiktionary_translations_to_sqlite conversion. """ input_dir = tmp_path / "input" - output_dir = tmp_path / "output" input_dir.mkdir() - output_dir.mkdir() # Create language directory. lang_dir = input_dir / "english" @@ -437,12 +430,11 @@ def test_wiktionary_translations_to_sqlite_basic(tmp_path): language="english", identifier_case="snake", input_file=str(input_dir), - output_file=str(output_dir), overwrite=True, ) # Verify database was created. - db_path = output_dir / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" assert db_path.exists() conn = sqlite3.connect(db_path) @@ -474,14 +466,12 @@ def test_wiktionary_translations_to_sqlite_basic(tmp_path): conn.close() -def test_wiktionary_translations_to_sqlite_camel_case(tmp_path): +def test_cli_convert_wiktionary_translations_to_sqlite_camel_case(tmp_path): """ Test wiktionary_translations_to_sqlite with camelCase identifiers. """ input_dir = tmp_path / "input" - output_dir = tmp_path / "output" input_dir.mkdir() - output_dir.mkdir() lang_dir = input_dir / "english" lang_dir.mkdir() @@ -500,11 +490,10 @@ def test_wiktionary_translations_to_sqlite_camel_case(tmp_path): language="english", identifier_case="camel", input_file=str(input_dir), - output_file=str(output_dir), overwrite=True, ) - db_path = output_dir / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" conn = sqlite3.connect(db_path) cursor = conn.cursor() @@ -515,15 +504,11 @@ def test_wiktionary_translations_to_sqlite_camel_case(tmp_path): conn.close() -def test_wiktionary_translations_to_sqlite_missing_dir(tmp_path, capsys): +def test_cli_convert_wiktionary_translations_to_sqlite_missing_dir(tmp_path, capsys): """ Test wiktionary_translations_to_sqlite with non-existent language directory. """ - wiktionary_translations_to_sqlite( - language="nonexistent", - input_file=str(tmp_path), - output_file=str(tmp_path / "output"), - ) + wiktionary_translations_to_sqlite(language="nonexistent", input_file=str(tmp_path)) captured = capsys.readouterr() assert "Warning" in captured.out @@ -538,20 +523,13 @@ def test_wiktionary_translations_to_sqlite_no_translation_files(tmp_path): lang_dir = input_dir / "english" lang_dir.mkdir(parents=True) - output_dir = tmp_path / "output" - output_dir.mkdir() - # Create a non-translation JSON file. (lang_dir / "nouns.json").write_text("{}") - wiktionary_translations_to_sqlite( - language="english", - input_file=str(input_dir), - output_file=str(output_dir), - ) + wiktionary_translations_to_sqlite(language="english", input_file=str(input_dir)) # No TranslationData.sqlite should be created. - db_path = output_dir / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" assert not db_path.exists() @@ -560,9 +538,7 @@ def test_wiktionary_translations_to_sqlite_multiple_files(tmp_path): Test wiktionary_translations_to_sqlite with multiple translation files. """ input_dir = tmp_path / "input" - output_dir = tmp_path / "output" input_dir.mkdir() - output_dir.mkdir() lang_dir = input_dir / "english" lang_dir.mkdir() @@ -582,11 +558,10 @@ def test_wiktionary_translations_to_sqlite_multiple_files(tmp_path): language="english", identifier_case="snake", input_file=str(input_dir), - output_file=str(output_dir), overwrite=True, ) - db_path = output_dir / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" conn = sqlite3.connect(db_path) cursor = conn.cursor() diff --git a/tests/cli/convert/test_cli_convert_wrapper.py b/tests/cli/convert/test_cli_convert_wrapper.py new file mode 100644 index 000000000..576f917e6 --- /dev/null +++ b/tests/cli/convert/test_cli_convert_wrapper.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI convert functionality. +""" + +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from scribe_data.cli.convert.wrapper import convert_wrapper + +# MARK: Wrapper + + +class TestCLIConvertWrapper(unittest.TestCase): + @pytest.fixture(autouse=True) + def _setup_fixtures(self, tmp_path): + self.tmp_path = tmp_path + + @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + @patch("shutil.copy") + def test_cli_convert_to_sqlite( + self, + mock_shutil_copy: MagicMock, + mock_data_to_sqlite: MagicMock, + mock_path: MagicMock, + ) -> None: + mock_path.return_value.exists.return_value = True + + convert_wrapper( + languages=["english"], + data_types=["nouns"], + input_path=Path("file"), + output_type="sqlite", + overwrite=True, + identifier_case="camel", + ) + + mock_data_to_sqlite.assert_called_with( + languages=["english"], + specific_tables=["nouns"], + identifier_case="camel", + input_file=Path("file"), + output_file=Path("/output"), + overwrite=True, + ) + + @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + def test_cli_convert_to_sqlite_no_output_dir( + self, mock_data_to_sqlite: MagicMock, mock_path: MagicMock + ) -> None: + mock_input_file = MagicMock() + mock_input_file.exists.return_value = True + + mock_path.return_value = mock_input_file + + mock_input_file.parent = MagicMock() + mock_input_file.parent.__truediv__.return_value = MagicMock() + mock_input_file.parent.__truediv__.return_value.exists.return_value = False + + convert_wrapper( + languages=["english"], + data_types=["nouns"], + input_path=Path(mock_input_file), + output_type="sqlite", + overwrite=True, + identifier_case="camel", + ) + + mock_data_to_sqlite.assert_called_with( + languages=["english"], + specific_tables=["nouns"], + identifier_case="camel", + input_file=Path(mock_input_file), + output_file=Path("scribe_data_sqlite_export"), + overwrite=True, + ) + + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + def test_cli_convert_wrapper_german_wiktionary_translations_sqlite( + self, mock_data_to_sqlite: MagicMock + ) -> None: + convert_wrapper( + languages=["german"], + data_types=["wiktionary_translations"], + input_path=Path("/input"), + output_type="sqlite", + overwrite=False, + identifier_case="camel", + ) + + mock_data_to_sqlite.assert_called_once_with( + languages=["german"], + specific_tables=["wiktionary_translations"], + identifier_case="camel", + input_file=Path("/input"), + output_file=Path("/output"), + overwrite=False, + ) + + @patch( + "scribe_data.cli.convert.DEFAULT_WIKTIONARY_JSON_EXPORT_DIR", + new=Path("/mock_wiktionary_dir"), + ) + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + def test_cli_convert_wrapper_wiktionary_no_input_path_uses_wiktionary_default( + self, mock_data_to_sqlite: MagicMock + ) -> None: + convert_wrapper( + languages=["german"], + data_types=["wiktionary_translations"], + input_path=None, + output_type="sqlite", + overwrite=False, + ) + + mock_data_to_sqlite.assert_called_once_with( + languages=["german"], + specific_tables=["wiktionary_translations"], + identifier_case="camel", + input_file=Path("/mock_wiktionary_dir"), + output_file=Path("/output"), + overwrite=False, + ) + + def test_cli_convert_wrapper_no_output_type(self) -> None: + with self.assertRaises(ValueError) as context: + convert_wrapper( + languages=["English"], + data_types=["nouns"], + input_path=Path("Data/encode.csv"), + output_type="parquet", + overwrite=True, + ) + + self.assertEqual( + str(context.exception), + "Unsupported output type 'parquet'. Must be 'json', 'csv', 'tsv' or 'sqlite'.", + ) diff --git a/tests/cli/test_download.py b/tests/cli/download/test_cli_download_wikidata_lexeme_dump.py similarity index 86% rename from tests/cli/test_download.py rename to tests/cli/download/test_cli_download_wikidata_lexeme_dump.py index 5347a3a9f..8179270c1 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/download/test_cli_download_wikidata_lexeme_dump.py @@ -10,8 +10,8 @@ import requests -from scribe_data.cli.download import ( - available_closest_lexeme_dumpfile, +from scribe_data.cli.download.wikidata_lexeme_dump import ( + available_closest_lexeme_dump_file, download_wd_lexeme_dump, parse_date, wd_lexeme_dump_download_wrapper, @@ -35,7 +35,7 @@ def test_parse_date_invalid_format(self) -> None: self.assertIsNone(parse_date("99-16-77")) self.assertIsNone(parse_date("invalid-date")) - @patch("scribe_data.cli.download.requests.get") + @patch("requests.get") def test_available_closest_lexeme_dumpfile(self, mock_get: MagicMock) -> None: """ Test finding closest available lexeme dump file. @@ -50,14 +50,14 @@ def test_available_closest_lexeme_dumpfile(self, mock_get: MagicMock) -> None: ) target_date = "20240103" other_old_dumps = ["20240101", "20240105", "20240110"] - closest = available_closest_lexeme_dumpfile( + closest = available_closest_lexeme_dump_file( target_date, other_old_dumps, mock_check_func ) self.assertEqual(closest, "20240101") - @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.re.findall") - def test_download_wd_lexeme_dump_latest( + @patch("requests.get") + @patch("re.findall") + def test_cli_download_wd_lexeme_dump_latest( self, mock_findall: MagicMock, mock_get: MagicMock ) -> None: """ @@ -72,9 +72,9 @@ def test_download_wd_lexeme_dump_latest( "https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.json.bz2", ) - @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.re.findall") - def test_download_wd_lexeme_dump_by_date( + @patch("requests.get") + @patch("re.findall") + def test_cli_download_wd_lexeme_dump_by_date( self, mock_findall: MagicMock, mock_get: MagicMock ) -> None: """ @@ -89,15 +89,15 @@ def test_download_wd_lexeme_dump_by_date( "https://dumps.wikimedia.org/wikidatawiki/entities/20241127/wikidata-20241127-lexemes.json.bz2", ) - @patch("scribe_data.cli.download.requests.get") + @patch("requests.get") @patch( "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=False ) @patch("scribe_data.cli.download.open", new_callable=mock_open) @patch("scribe_data.cli.download.tqdm") @patch("scribe_data.cli.download.os.makedirs") - @patch("scribe_data.cli.download.questionary.confirm") - def test_wd_lexeme_dump_download_wrapper_latest( + @patch("questionary.confirm") + def test_cli_download_wd_lexeme_dump_wrapper_latest( self, mock_confirm: MagicMock, mock_makedirs: MagicMock, @@ -134,7 +134,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( "scribe_data.utils.Path.glob", return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], ) - def test_check_lexeme_dump_prompt_download_existing( + def test_cli_download_check_lexeme_dump_prompt_download_existing( self, mock_glob: MagicMock, mock_select: MagicMock ) -> None: """ @@ -153,7 +153,7 @@ def test_check_lexeme_dump_prompt_download_existing( "scribe_data.utils.Path.glob", return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], ) - def test_check_lexeme_dump_prompt_download_delete( + def test_cli_download_check_lexeme_dump_prompt_download_delete( self, mock_glob: MagicMock, mock_select: MagicMock ) -> None: """ @@ -172,9 +172,9 @@ def test_check_lexeme_dump_prompt_download_delete( self.assertTrue(mock_unlink.called) self.assertTrue(result) - @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.questionary.confirm") - def test_download_wd_lexeme_dump_http_error( + @patch("requests.get") + @patch("questionary.confirm") + def test_cli_download_wd_lexeme_dump_http_error( self, mock_confirm: MagicMock, mock_get: MagicMock ) -> None: """ @@ -199,8 +199,8 @@ def test_download_wd_lexeme_dump_http_error( "We could not find your requested Wikidata lexeme dump." ) - @patch("scribe_data.cli.download.requests.get") - def test_download_wd_lexeme_dump_request_exception( + @patch("requests.get") + def test_cli_download_wd_lexeme_dump_request_exception( self, mock_get: MagicMock ) -> None: """ @@ -213,9 +213,9 @@ def test_download_wd_lexeme_dump_request_exception( self.assertIsNone(result) mock_print.assert_called_with("An error occurred: Connection error") - @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.questionary.confirm") - def test_download_wd_lexeme_dump_find_closest( + @patch("requests.get") + @patch("questionary.confirm") + def test_cli_download_wd_lexeme_dump_find_closest( self, mock_confirm: MagicMock, mock_get: MagicMock ) -> None: """ @@ -244,9 +244,9 @@ def test_download_wd_lexeme_dump_find_closest( self.assertIsNotNone(result) self.assertIn("20240101", result) - @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.questionary.confirm") - def test_download_wd_lexeme_dump_user_declines_closest( + @patch("requests.get") + @patch("questionary.confirm") + def test_cli_download_wd_lexeme_dump_user_declines_closest( self, mock_confirm: MagicMock, mock_get: MagicMock ) -> None: """ @@ -265,7 +265,7 @@ def test_download_wd_lexeme_dump_user_declines_closest( result = download_wd_lexeme_dump("2024-01-01") self.assertIsNone(result) - def test_wd_lexeme_dump_download_wrapper_default_flag(self) -> None: + def test_cli_download_wd_lexeme_dump_download_wrapper_default_flag(self) -> None: """ Test wrapper function with default flag set to True. """ @@ -275,8 +275,10 @@ def test_wd_lexeme_dump_download_wrapper_default_flag(self) -> None: result = wd_lexeme_dump_download_wrapper(default=True) self.assertFalse(result) - @patch("scribe_data.cli.download.requests.get") - def test_download_wd_lexeme_dump_invalid_date(self, mock_get: MagicMock) -> None: + @patch("requests.get") + def test_cli_download_wd_lexeme_dump_invalid_date( + self, mock_get: MagicMock + ) -> None: """ Test downloading with invalid date format. """ diff --git a/tests/cli/interactive/test_cli_interactive_config.py b/tests/cli/interactive/test_cli_interactive_config.py new file mode 100644 index 000000000..5997e0a81 --- /dev/null +++ b/tests/cli/interactive/test_cli_interactive_config.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI interactive mode configuration functionality. +""" + +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from scribe_data.cli.interactive.config import ScribeDataConfig +from scribe_data.cli.interactive.run import configure_settings + + +class TestScribeDataCLIInteractiveConfig(unittest.TestCase): + def setUp(self) -> None: + """ + Set up test fixtures before each test method. + """ + self.config = ScribeDataConfig() + # Mock the language_metadata and data_type_metadata. + self.config.languages = ["english", "spanish", "french"] + self.config.data_types = ["nouns", "verbs"] + + def test_cli_interactive_config_initialization(self) -> None: + """ + Test ScribeDataConfig initialization. + """ + self.assertEqual(self.config.selected_languages, []) + self.assertEqual(self.config.selected_data_types, []) + self.assertEqual(self.config.output_type, "json") + self.assertIsInstance(self.config.output_dir, Path) + self.assertFalse(self.config.overwrite) + self.assertFalse(self.config.configured) + + @patch("prompt_toolkit.prompt") + @patch("rich.print") + def test_cli_interactive_configure_settings_all_languages( + self, mock_rprint: MagicMock, mock_prompt: MagicMock + ) -> None: + """ + Test configure_settings with 'All' languages selection. + """ + # Set up mock responses. + responses = iter( + [ + "All", # languages + "nouns", # data types + "json", # output type + "", # output directory (default) + "y", # overwrite + ] + ) + mock_prompt.side_effect = lambda *args, **kwargs: next(responses) + + with patch("scribe_data.cli.interactive.config", self.config): + with patch("scribe_data.cli.interactive.display_summary"): + configure_settings() + + self.assertEqual(self.config.selected_languages, self.config.languages) + self.assertEqual(self.config.selected_data_types, ["nouns"]) + self.assertEqual(self.config.output_type, "json") + self.assertTrue(self.config.configured) + + @patch("prompt_toolkit.prompt") + @patch("rich.print") + def test_cli_interactive_configure_settings_specific_languages( + self, mock_rprint: MagicMock, mock_prompt: MagicMock + ) -> None: + """ + Test configure_settings with specific language selection. + """ + # Set up mock responses. + responses = iter( + [ + "english, spanish", # languages + "nouns, verbs", # data types + "csv", # output type + "/custom/path", # output directory + "n", # overwrite + ] + ) + mock_prompt.side_effect = lambda *args, **kwargs: next(responses) + + with patch("scribe_data.cli.interactive.config", self.config): + with patch("scribe_data.cli.interactive.display_summary"): + configure_settings() + + self.assertEqual(self.config.selected_languages, ["english", "spanish"]) + self.assertEqual(self.config.selected_data_types, ["nouns", "verbs"]) + self.assertEqual(self.config.output_type, "csv") + self.assertEqual(self.config.output_dir.as_posix(), "/custom/path") + self.assertFalse(self.config.overwrite) diff --git a/tests/cli/interactive/test_cli_interactive_execute.py b/tests/cli/interactive/test_cli_interactive_execute.py new file mode 100644 index 000000000..63e7ae146 --- /dev/null +++ b/tests/cli/interactive/test_cli_interactive_execute.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI interactive mode execution functionality. +""" + +import unittest +from unittest.mock import MagicMock, patch + +from scribe_data.cli.interactive.config import ScribeDataConfig +from scribe_data.cli.interactive.execute import ( + display_summary, + execute_request, +) + + +class TestScribeDataCLIInteractiveExecute(unittest.TestCase): + def setUp(self) -> None: + """ + Set up test fixtures before each test method. + """ + self.config = ScribeDataConfig() + # Mock the language_metadata and data_type_metadata. + self.config.languages = ["english", "spanish", "french"] + self.config.data_types = ["nouns", "verbs"] + + @patch("scribe_data.cli.get.get_data") + @patch("tqdm.tqdm") + @patch("scribe_data.cli.interactive.execute.logger") + def test_cli_interactive_execute_request( + self, mock_logger: MagicMock, mock_tqdm: MagicMock, mock_get_data: MagicMock + ) -> None: + """ + Test execute_request functionality. + """ + self.config.selected_languages = ["english"] + self.config.selected_data_types = ["nouns"] + self.config.configured = True + + mock_get_data.return_value = True + mock_progress = MagicMock() + mock_tqdm.return_value.__enter__.return_value = mock_progress + + with patch("scribe_data.cli.interactive.config", self.config): + execute_request() + + mock_get_data.assert_called_once_with( + languages=["english"], + data_types=["nouns"], + output_type=self.config.output_type, + output_dir=self.config.output_dir, + overwrite=self.config.overwrite, + interactive=True, + ) + + @patch("rich.console.Console.print") + def test_cli_interactive_display_summary(self, mock_print: MagicMock) -> None: + """ + Test display_summary functionality. + """ + self.config.selected_languages = ["english"] + self.config.selected_data_types = ["nouns"] + self.config.output_type = "json" + + with patch("scribe_data.cli.interactive.config", self.config): + display_summary() + mock_print.assert_called() diff --git a/tests/cli/interactive/test_cli_interactive_prompt.py b/tests/cli/interactive/test_cli_interactive_prompt.py new file mode 100644 index 000000000..bd1697f2e --- /dev/null +++ b/tests/cli/interactive/test_cli_interactive_prompt.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI interactive mode prompt functionality. +""" + +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +from prompt_toolkit.completion import WordCompleter + +from scribe_data.cli.interactive.config import ScribeDataConfig +from scribe_data.cli.interactive.prompt import ( + create_word_completer, + prompt_for_data_types, + prompt_for_languages, + resolve_wiktionary_dump_path, +) + + +class TestScribeDataCLIInteractivePrompt(unittest.TestCase): + def setUp(self) -> None: + """ + Set up test fixtures before each test method. + """ + self.config = ScribeDataConfig() + # Mock the language_metadata and data_type_metadata. + self.config.languages = ["english", "spanish", "french"] + self.config.data_types = ["nouns", "verbs"] + + @patch("prompt_toolkit.prompt") + @patch("rich.print") + def test_cli_interactive_request_total_lexeme( + self, mock_rprint: MagicMock, mock_prompt: MagicMock + ) -> None: + """ + Test request_total_lexeme functionality. + """ + # Set up mock responses. + mock_prompt.side_effect = [ + "english, french", # first call for languages + "nouns", # first call for data types + ] + + with patch("scribe_data.cli.interactive.config", self.config): + with patch( + "scribe_data.cli.interactive.list_all_languages", + return_value=["english", "french"], + ): + prompt_for_languages() + prompt_for_data_types() + + # Verify the config was updated correctly. + self.assertEqual(self.config.selected_languages, ["english", "french"]) + self.assertEqual(self.config.selected_data_types, ["nouns"]) + + # Verify prompt was called with correct arguments. + expected_calls = [ + call( + "Select languages (comma-separated or 'All'): ", + completer=unittest.mock.ANY, + default="", + ), + call( + "Select data types (comma-separated or 'All'): ", + completer=unittest.mock.ANY, + default="", + ), + ] + mock_prompt.assert_has_calls(expected_calls, any_order=False) + + def test_cli_interactive_resolve_wiktionary_dump_path_from_subdirectory( + self, + ) -> None: + """ + Find dumps when cwd is not the project root. + """ + with patch("os.getcwd") as mock_getcwd: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + dump_dir = root / "scribe_data_wiktionary_dumps_export" + json_dir = root / "scribe_data_json_export" + dump_dir.mkdir() + json_dir.mkdir() + dump_file = dump_dir / "dewiktionary-pages-articles.xml.bz2" + dump_file.write_bytes(b"x") + + mock_getcwd.return_value = str(json_dir) + resolved = resolve_wiktionary_dump_path( + "german", "scribe_data_wiktionary_dumps_export" + ) + + self.assertEqual(resolved, dump_file.resolve()) + + def test_cli_interactive_create_word_completer(self) -> None: + """ + Test create_word_completer functionality. + """ + # Test without 'All' option. + options = ["english", "spanish", "french"] + completer = create_word_completer(options, include_all=False) + self.assertIsInstance(completer, WordCompleter) + self.assertEqual(completer.words, options) + + # Test with 'All' option. + completer_with_all = create_word_completer(options, include_all=True) + self.assertEqual(completer_with_all.words, ["All"] + options) diff --git a/tests/cli/interactive/test_cli_interactive_run.py b/tests/cli/interactive/test_cli_interactive_run.py new file mode 100644 index 000000000..60df8346e --- /dev/null +++ b/tests/cli/interactive/test_cli_interactive_run.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI interactive mode runner functionality. +""" + +import unittest +from pathlib import Path +from unittest.mock import patch + +from scribe_data.cli.interactive.config import ScribeDataConfig +from scribe_data.cli.interactive.run import run_interactive_mode + + +class TestScribeDataCLIInteractiveRun(unittest.TestCase): + def setUp(self) -> None: + """ + Set up test fixtures before each test method. + """ + self.config = ScribeDataConfig() + # Mock the language_metadata and data_type_metadata. + self.config.languages = ["english", "spanish", "french"] + self.config.data_types = ["nouns", "verbs"] + + @patch( + "scribe_data.cli.interactive.prompt.resolve_wiktionary_dump_path", + return_value=Path("/dump/path"), + ) + @patch("scribe_data.wiktionary.parse_translations.parse_wiktionary_translations") + @patch("prompt_toolkit.prompt") + @patch("scribe_data.cli.interactive.execute.prompt_for_languages") + @patch("questionary.select") + def test_cli_interactive_run_translations( + self, + mock_select, + mock_prompt_languages, + mock_prompt, + mock_parse_wiktionary, + mock_resolve_dump, + ): + mock_select.return_value.ask.side_effect = ["translations"] + mock_prompt.side_effect = [ + "german", + "/dump/path", + "scribe_data_wiktionary_json_export", + "false", + ] + self.config.selected_languages = ["english"] + + run_interactive_mode(operation="translations") + + mock_parse_wiktionary.assert_called_once_with( + target_languages=["english"], + wiktionary_dump_path=Path("/dump/path"), + output_dir=Path("scribe_data_wiktionary_json_export"), + overwrite=False, + ) diff --git a/tests/cli/list/test_cli_list_data_types.py b/tests/cli/list/test_cli_list_data_types.py new file mode 100644 index 000000000..86d539837 --- /dev/null +++ b/tests/cli/list/test_cli_list_data_types.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI list data types functionality. +""" + +import unittest +from unittest.mock import MagicMock, call, patch + +from scribe_data.cli.list.data_types import list_data_types +from scribe_data.cli.main import main + + +class TestCLIListDataTypes(unittest.TestCase): + @patch("builtins.print") + def test_cli_list_data_types_all_languages(self, mock_print: MagicMock) -> None: + list_data_types() + print(mock_print.mock_calls) + expected_calls = [ + call(), + call("Available data types: All languages"), + call("==================================="), + call("adjectives"), + call("adverbs"), + # call("articles"), + call("conjunctions"), + call("emoji-keywords"), + call("nouns"), + call("personal-pronouns"), + call("postpositions"), + call("prepositions"), + call("pronouns"), + call("proper-nouns"), + call("verbs"), + call(), + ] + mock_print.assert_has_calls(expected_calls) + + @patch("builtins.print") + def test_cli_list_data_types_specific_language(self, mock_print: MagicMock) -> None: + list_data_types("english") + + expected_calls = [ + call(), + call("Available data types: English"), + call("============================="), + call("adjectives"), + call("adverbs"), + call("emoji-keywords"), + call("nouns"), + call("personal-pronouns"), + call("prepositions"), + call("pronouns"), + call("proper-nouns"), + call("verbs"), + call(), + ] + mock_print.assert_has_calls(expected_calls) + + def test_cli_list_data_types_invalid_language(self) -> None: + with self.assertRaises(ValueError): + list_data_types("InvalidLanguage") + + def test_cli_list_data_types_no_data_types(self) -> None: + with self.assertRaises(ValueError): + list_data_types("Klingon") + + @patch("scribe_data.cli.list.data_types.list_data_types") + def test_cli_list_data_types_command(self, mock_list_data_types: MagicMock) -> None: + test_args = ["main.py", "list", "--data-type"] + with patch("sys.argv", test_args): + main() + + mock_list_data_types.assert_called_once() diff --git a/tests/cli/list/test_cli_list_languages.py b/tests/cli/list/test_cli_list_languages.py new file mode 100644 index 000000000..7dec7c2cc --- /dev/null +++ b/tests/cli/list/test_cli_list_languages.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI list languages functionality. +""" + +import unittest +from unittest.mock import MagicMock, patch + +from scribe_data.cli.list.languages import list_languages, list_languages_for_data_type +from scribe_data.cli.main import main +from scribe_data.utils import ( + get_language_iso, + get_language_qid, + list_all_languages, + list_languages_with_metadata_for_data_type, +) + + +class TestCLIListLanguages(unittest.TestCase): + @patch("builtins.print") + def test_cli_list_languages(self, mock_print: MagicMock) -> None: + list_languages() + + # Verify the headers. + mock_print.assert_any_call("\nLanguage ISO QID ") + mock_print.assert_any_call("=================================") + + # Dynamically get the first language from the metadata. + languages = list_all_languages() + first_language = languages[0] + first_iso = get_language_iso(first_language) + first_qid = get_language_qid(first_language) + + # Verify the first language entry. + # Calculate column widths as in the actual function. + language_col_width = max(len(lang) for lang in languages) + 2 + iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 + qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 + + # Verify the first language entry with dynamic spacing. + mock_print.assert_any_call( + f"{first_language.capitalize():<{language_col_width}} {first_iso:<{iso_col_width}} {first_qid:<{qid_col_width}}" + ) + # Total print calls: N (languages) + 3 (header, one separator, final line). + self.assertEqual(mock_print.call_count, len(languages) + 3) + + @patch("builtins.print") + def test_cli_list_languages_for_data_type_valid( + self, mock_print: MagicMock + ) -> None: + # Call the function with a specific data type. + list_languages_for_data_type("nouns") + + # Dynamically create the header based on column widths. + all_languages = list_languages_with_metadata_for_data_type() + + # Calculate column widths as in the actual function. + language_col_width = max(len(lang["name"]) for lang in all_languages) + 2 + iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2 + qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2 + + # Dynamically generate the expected header string. + expected_header = f"{'\nLanguage':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" + + # Verify the headers dynamically. + mock_print.assert_any_call(expected_header) + mock_print.assert_any_call( + "=" * (language_col_width + iso_col_width + qid_col_width) + ) + + # Verify the first language entry if there are any languages. + + first_language = all_languages[0]["name"].capitalize() + first_iso = all_languages[0]["iso"] + first_qid = all_languages[0]["qid"] + + # Verify the first language entry with dynamic spacing. + mock_print.assert_any_call( + f"{first_language:<{language_col_width}} {first_iso:<{iso_col_width}} {first_qid:<{qid_col_width}}" + ) + + # Check the total number of calls. + # Total calls = N (languages) + 3 (header, one separator, final line). + expected_calls = len(all_languages) + 3 + self.assertEqual(mock_print.call_count, expected_calls) + + @patch("scribe_data.cli.list.languages.list_languages") + def test_cli_list_languages_command(self, mock_list_languages: MagicMock) -> None: + test_args = ["main.py", "list", "--language"] + with patch("sys.argv", test_args): + main() + + mock_list_languages.assert_called_once() diff --git a/tests/cli/list/test_cli_list_wrapper.py b/tests/cli/list/test_cli_list_wrapper.py new file mode 100644 index 000000000..fbf4e68c2 --- /dev/null +++ b/tests/cli/list/test_cli_list_wrapper.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI list wrapper functionality. +""" + +import unittest +from unittest.mock import MagicMock, patch + +from scribe_data.cli.list.wrapper import ( + list_all, + list_wrapper, +) +from scribe_data.cli.main import main + + +class TestCLIListWrapper(unittest.TestCase): + @patch("scribe_data.cli.list.languages.list_languages") + @patch("scribe_data.cli.list.data_types.list_data_types") + def test_cli_list_all( + self, mock_list_data_types: MagicMock, mock_list_languages: MagicMock + ) -> None: + list_all() + mock_list_languages.assert_called_once() + mock_list_data_types.assert_called_once() + + @patch("scribe_data.cli.list.wrapper.list_all") + def test_cli_list_wrapper_all(self, mock_list_all: MagicMock) -> None: + list_wrapper(all_bool=True) + mock_list_all.assert_called_once() + + @patch("scribe_data.cli.list.languages.list_languages") + def test_cli_list_wrapper_languages(self, mock_list_languages: MagicMock) -> None: + list_wrapper(language=True) + mock_list_languages.assert_called_once() + + @patch("scribe_data.cli.list.data_types.list_data_types") + def test_cli_list_wrapper_data_types(self, mock_list_data_types: MagicMock) -> None: + list_wrapper(data_type=True) + mock_list_data_types.assert_called_once() + + @patch("builtins.print") + def test_cli_list_wrapper_language_and_data_type( + self, mock_print: MagicMock + ) -> None: + list_wrapper(language=True, data_type=True) + mock_print.assert_called_with( + "Please specify either a language or a data type." + ) + + @patch("scribe_data.cli.list.languages.list_languages_for_data_type") + def test_cli_list_wrapper_languages_for_data_type( + self, mock_list_languages_for_data_type: MagicMock + ) -> None: + list_wrapper(language=True, data_type="example_data_type") + mock_list_languages_for_data_type.assert_called_with("example_data_type") + + @patch("scribe_data.cli.list.data_types.list_data_types") + def test_cli_list_wrapper_data_types_for_language( + self, mock_list_data_types: MagicMock + ) -> None: + list_wrapper(language="English", data_type=True) + mock_list_data_types.assert_called_with("English") + + @patch("scribe_data.cli.list.wrapper.list_all") + def test_cli_list_all_command(self, mock_list_all: MagicMock) -> None: + test_args = ["main.py", "list", "--all"] + with patch("sys.argv", test_args): + main() + + mock_list_all.assert_called_once() diff --git a/tests/cli/test_get.py b/tests/cli/test_cli_get.py similarity index 79% rename from tests/cli/test_get.py rename to tests/cli/test_cli_get.py index b1faa6732..47f5c542f 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_cli_get.py @@ -18,11 +18,10 @@ DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, ) -class TestGetData(unittest.TestCase): +class TestCLIGetData(unittest.TestCase): """ Unit tests for the get_data function. @@ -32,25 +31,18 @@ class TestGetData(unittest.TestCase): # MARK: Subprocess Patching @patch("scribe_data.cli.get.generate_emoji") - def test_get_emoji_keywords(self, generate_emoji: MagicMock) -> None: + def test_cli_get_emoji_keywords(self, generate_emoji: MagicMock) -> None: """ Test the generation of emoji keywords. This test ensures that when thee `data_type` is `emoji_keywords`, the `generate_emoji` function is called with the correct arguments. """ - get_data( - languages=["English"], - data_types=["emoji_keywords"], - output_dir=Path("./test_output"), - ) - generate_emoji.assert_called_once_with( - language="English", - output_dir=Path("./test_output"), - ) + get_data(languages=["English"], data_types=["emoji_keywords"]) + generate_emoji.assert_called_once_with(language="English") # MARK: Invalid Arguments - def test_invalid_arguments(self) -> None: + def test_cli_get_invalid_arguments(self) -> None: """ Test the behavior of the get_data function when invalid arguments are provided. """ @@ -61,8 +53,8 @@ def test_invalid_arguments(self) -> None: @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - @patch("scribe_data.cli.get.questionary.confirm") - def test_get_all_data_types_for_language_user_says_no( + @patch("questionary.confirm") + def test_cli_get_all_data_types_for_language_user_says_no( self, mock_questionary_confirm: MagicMock, mock_parse: MagicMock, @@ -82,14 +74,13 @@ def test_get_all_data_types_for_language_user_says_no( languages=["English"], data_types=["all"], wikidata_dump_type=["form"], - output_dir=DEFAULT_JSON_EXPORT_DIR, wikidata_dump_path=None, # explicitly set to None overwrite_all=False, ) mock_query_data.assert_not_called() @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - def test_get_all_languages_and_data_types(self, mock_parse: MagicMock) -> None: + def test_cli_get_all_languages_and_data_types(self, mock_parse: MagicMock) -> None: """ Test retrieving all languages for a specific data type. @@ -101,7 +92,6 @@ def test_get_all_languages_and_data_types(self, mock_parse: MagicMock) -> None: languages=["all"], data_types=["all"], wikidata_dump_type=["form", "translations"], - output_dir=DEFAULT_JSON_EXPORT_DIR, wikidata_dump_path=None, overwrite_all=False, ) @@ -109,7 +99,7 @@ def test_get_all_languages_and_data_types(self, mock_parse: MagicMock) -> None: # MARK: Language and Data Type @patch("scribe_data.cli.get.query_data") - def test_get_specific_language_and_data_type( + def test_cli_get_specific_language_and_data_type( self, mock_query_data: MagicMock ) -> None: """ @@ -117,13 +107,10 @@ def test_get_specific_language_and_data_type( Ensures that `query_data` is called properly when a specific language and data type are provided. """ - get_data( - languages=["german"], data_types=["nouns"], output_dir=Path("./test_output") - ) + get_data(languages=["german"], data_types=["nouns"]) mock_query_data.assert_called_once_with( languages=["german"], data_types=["nouns"], - output_dir=Path("./test_output"), overwrite=False, interactive=False, ) @@ -133,7 +120,7 @@ def test_get_specific_language_and_data_type( @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.Path.glob", return_value=[]) @patch("scribe_data.cli.get.check_index_exists") - def test_get_data_with_capitalized_language( + def test_cli_get_data_with_capitalized_language( self, mock_check_index: MagicMock, mock_glob: MagicMock, @@ -149,7 +136,6 @@ def test_get_data_with_capitalized_language( mock_query_data.assert_called_once_with( languages=["German"], data_types=["nouns"], - output_dir=DEFAULT_JSON_EXPORT_DIR, overwrite=False, interactive=False, ) @@ -159,7 +145,7 @@ def test_get_data_with_capitalized_language( @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.Path.glob", return_value=[]) @patch("scribe_data.cli.get.check_index_exists", return_value=False) - def test_get_data_with_lowercase_language( + def test_cli_get_data_with_lowercase_language( self, mock_check_index: MagicMock, mock_glob: MagicMock, @@ -174,7 +160,6 @@ def test_get_data_with_lowercase_language( mock_query_data.assert_called_once_with( languages=["german"], data_types=["nouns"], - output_dir=DEFAULT_JSON_EXPORT_DIR, overwrite=False, interactive=False, ) @@ -182,7 +167,7 @@ def test_get_data_with_lowercase_language( # MARK: Output Directory @patch("scribe_data.cli.get.query_data") - def test_get_data_with_different_output_directory( + def test_cli_get_data_with_different_output_directory( self, mock_query_data: MagicMock ) -> None: """ @@ -190,15 +175,10 @@ def test_get_data_with_different_output_directory( Ensures that `query_data` is called properly when a different output directory is provided. """ - get_data( - languages=["german"], - data_types=["nouns"], - output_dir=Path("./custom_output_test"), - ) + get_data(languages=["german"], data_types=["nouns"]) mock_query_data.assert_called_once_with( languages=["german"], data_types=["nouns"], - output_dir=Path("./custom_output_test"), overwrite=False, interactive=False, ) @@ -207,7 +187,7 @@ def test_get_data_with_different_output_directory( @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.Path.glob", return_value=[]) - def test_get_data_with_overwrite_true( + def test_cli_get_data_with_overwrite_true( self, mock_glob: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -219,7 +199,6 @@ def test_get_data_with_overwrite_true( mock_query_data.assert_called_once_with( languages=["English"], data_types=["verbs"], - output_dir=DEFAULT_JSON_EXPORT_DIR, overwrite=True, interactive=False, ) @@ -227,18 +206,18 @@ def test_get_data_with_overwrite_true( # MARK: Overwrite is False @patch("scribe_data.cli.get.query_data") - def test_get_data_with_overwrite_false(self, mock_query_data: MagicMock) -> None: + def test_cli_get_data_with_overwrite_false( + self, mock_query_data: MagicMock + ) -> None: get_data( languages=["English"], data_types=["verbs"], overwrite=False, - output_dir=Path("./custom_output_test"), interactive=False, ) mock_query_data.assert_called_once_with( languages=["English"], data_types=["verbs"], - output_dir=Path("./custom_output_test"), overwrite=False, interactive=False, ) @@ -250,9 +229,9 @@ def test_get_data_with_overwrite_false(self, mock_query_data: MagicMock) -> None "scribe_data.cli.get.Path.glob", return_value=[Path("./test_output/English/nouns.json")], ) - @patch("scribe_data.cli.get.questionary.confirm") + @patch("questionary.confirm") @patch("scribe_data.cli.get.check_index_exists") - def test_user_skips_existing_file( + def test_cli_get_user_skips_existing_file( self, mock_check_index: MagicMock, mock_questionary_confirm: MagicMock, @@ -267,11 +246,7 @@ def test_user_skips_existing_file( mock_questionary_confirm.return_value.ask.return_value = False mock_check_index.return_value = True - result = get_data( - languages=["English"], - data_types=["nouns"], - output_dir=Path("./test_output"), - ) + result = get_data(languages=["English"], data_types=["nouns"]) # Validate the skip result. self.assertEqual(result, {"success": False, "skipped": True}) @@ -284,8 +259,8 @@ def test_user_skips_existing_file( "scribe_data.cli.get.Path.glob", return_value=[Path("./test_output/English/nouns.json")], ) - @patch("scribe_data.cli.get.questionary.confirm") - def test_user_overwrites_existing_file( + @patch("questionary.confirm") + def test_cli_get_user_overwrites_existing_file( self, mock_questionary_confirm: MagicMock, mock_glob: MagicMock, @@ -297,15 +272,10 @@ def test_user_overwrites_existing_file( Ensures that the file is overwritten and the function returns the correct result. """ mock_questionary_confirm.return_value.ask.return_value = True - get_data( - languages=["English"], - data_types=["nouns"], - output_dir=Path("./test_output"), - ) + get_data(languages=["English"], data_types=["nouns"]) mock_query_data.assert_called_once_with( languages=["English"], data_types=["nouns"], - output_dir=Path("./test_output"), overwrite=False, interactive=False, ) @@ -313,48 +283,35 @@ def test_user_overwrites_existing_file( # MARK: Translations @patch("scribe_data.wiktionary.parse_translations.parse_wiktionary_translations") - def test_get_translations_no_language_specified(self, mock_parse): + def test_cli_get_translations_no_language_specified(self, mock_parse): get_data(data_types=["translations"]) mock_parse.assert_called_once_with( target_languages=None, wiktionary_dump_path=None, - output_dir=DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, overwrite=False, ) @patch("scribe_data.wiktionary.parse_translations.parse_wiktionary_translations") - def test_get_translations_with_specific_language(self, mock_parse): - get_data( - languages=["Spanish"], - data_types=["translations"], - output_dir=Path("./test_output"), - ) + def test_cli_get_translations_with_specific_language(self, mock_parse): + get_data(languages=["Spanish"], data_types=["translations"]) mock_parse.assert_called_once_with( - target_languages=["Spanish"], - wiktionary_dump_path=None, - output_dir=Path("./test_output"), - overwrite=False, + target_languages=["Spanish"], wiktionary_dump_path=None, overwrite=False ) @patch("scribe_data.wiktionary.parse_translations.parse_wiktionary_translations") - def test_get_translations_with_dump(self, mock_parse): - get_data( - languages=["German"], - data_types=["translations"], - wiktionary_dump=Path("./wikidump.json"), - ) + def test_cli_get_translations_with_dump(self, mock_parse): + get_data(languages=["German"], data_types=["translations"]) mock_parse.assert_called_once_with( target_languages=["German"], wiktionary_dump_path=Path("./wikidump.json"), - output_dir=DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, overwrite=False, ) # MARK: Use QID as language @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - @patch("scribe_data.cli.get.questionary.confirm") - def test_get_data_with_wikidata_identifier( + @patch("questionary.confirm") + def test_cli_get_data_with_wikidata_identifier( self, mock_questionary_confirm: MagicMock, mock_parse: MagicMock ) -> None: """ @@ -368,7 +325,6 @@ def test_get_data_with_wikidata_identifier( get_data( languages=["Q9217"], - output_dir=Path("exported_json"), wikidata_dump_path=Path("scribe"), all_bool=True, ) @@ -376,13 +332,12 @@ def test_get_data_with_wikidata_identifier( languages=["Q9217"], data_types=["all"], wikidata_dump_type=["form"], - output_dir=Path("exported_json"), wikidata_dump_path=Path("scribe"), overwrite_all=False, ) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - def test_get_data_with_wikidata_identifier_and_data_type( + def test_cli_get_data_with_wikidata_identifier_and_data_type( self, mock_parse: MagicMock ) -> None: """ @@ -394,22 +349,20 @@ def test_get_data_with_wikidata_identifier_and_data_type( get_data( languages=["Q9217"], data_types=["nouns"], - output_dir=Path("exported_json"), wikidata_dump_path=Path("scribe"), ) mock_parse.assert_called_once_with( languages=["Q9217"], wikidata_dump_type=["form"], data_types=["nouns"], - output_dir=Path("exported_json"), wikidata_dump_path=Path("scribe"), overwrite_all=False, ) # MARK: All Languages for Data Type @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - @patch("scribe_data.cli.get.questionary.confirm") - def test_get_all_languages_for_data_type_user_says_no( + @patch("questionary.confirm") + def test_cli_get_all_languages_for_data_type_user_says_no( self, mock_questionary_confirm: MagicMock, mock_parse: MagicMock ) -> None: """ @@ -421,19 +374,18 @@ def test_get_all_languages_for_data_type_user_says_no( # Mock user choosing to use lexeme dump instead of querying Wikidata. mock_questionary_confirm.return_value.ask.return_value = False - get_data(all_bool=True, data_types=["verbs"], output_dir=Path("test")) + get_data(all_bool=True, data_types=["verbs"]) mock_parse.assert_called_once_with( languages=["all"], data_types=["verbs"], wikidata_dump_type=["form"], - output_dir=Path("test"), wikidata_dump_path=None, overwrite_all=False, ) @patch("scribe_data.cli.get.query_data") - @patch("scribe_data.cli.get.questionary.confirm") - def test_get_all_languages_for_data_type_user_says_yes( + @patch("questionary.confirm") + def test_cli_get_all_languages_for_data_type_user_says_yes( self, mock_questionary_confirm: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -445,19 +397,16 @@ def test_get_all_languages_for_data_type_user_says_yes( # Mock user choosing to query Wikidata directly. mock_questionary_confirm.return_value.ask.return_value = True - get_data(all_bool=True, data_types=["verbs"], output_dir=Path("test")) + get_data(all_bool=True, data_types=["verbs"]) mock_query_data.assert_called_once_with( - languages=["all"], - data_types=["verbs"], - output_dir=Path("test"), - overwrite=False, + languages=["all"], data_types=["verbs"], overwrite=False ) # MARK: Error Handling @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.check_index_exists") - def test_json_decode_error_handling( + def test_cli_get_json_decode_error_handling( self, mock_check_index: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -470,7 +419,7 @@ def test_json_decode_error_handling( @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.check_index_exists") - def test_http_error_handling( + def test_cli_get_http_error_handling( self, mock_check_index: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -485,7 +434,7 @@ def test_http_error_handling( @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.check_index_exists") - def test_endpoint_error_handling( + def test_cli_get_endpoint_error_handling( self, mock_check_index: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -503,7 +452,7 @@ def test_endpoint_error_handling( @patch("scribe_data.cli.get.Path.exists") @patch("scribe_data.cli.get.os.remove") @patch("scribe_data.cli.get.check_index_exists") - def test_output_type_conversion( + def test_cli_get_output_type_conversion( self, mock_check_index: MagicMock, mock_remove: MagicMock, @@ -521,7 +470,6 @@ def test_output_type_conversion( languages=["German"], data_types=["verbs"], output_type="csv", - output_dir=Path("test_dir"), identifier_case="snake", ) @@ -532,7 +480,6 @@ def test_output_type_conversion( languages=["German"], data_types=["verbs"], input_path=expected_input_file, - output_dir=Path("test_dir"), output_type="csv", overwrite=False, identifier_case="snake", @@ -542,7 +489,7 @@ def test_output_type_conversion( # MARK: Default Output Directory @patch("scribe_data.cli.get.check_index_exists") - def test_default_output_directory_selection( + def test_cli_get_default_output_directory_selection( self, mock_check_index: MagicMock ) -> None: """ @@ -564,14 +511,13 @@ def test_default_output_directory_selection( mock_query.assert_called_with( languages=["German"], data_types=["verbs"], - output_dir=expected_dir, overwrite=False, interactive=False, ) @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.check_index_exists") - def test_get_data_with_interactive_mode( + def test_cli_get_data_with_interactive_mode( self, mock_check_exists: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -583,13 +529,12 @@ def test_get_data_with_interactive_mode( mock_query_data.assert_called_once_with( languages=["English"], data_types=["nouns"], - output_dir=DEFAULT_JSON_EXPORT_DIR, overwrite=False, interactive=True, ) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - def test_get_data_with_custom_dump_path(self, mock_parse: MagicMock) -> None: + def test_cli_get_data_with_custom_dump_path(self, mock_parse: MagicMock) -> None: """ Test retrieving data with a custom Wikidata dump path. """ @@ -601,13 +546,14 @@ def test_get_data_with_custom_dump_path(self, mock_parse: MagicMock) -> None: languages=["English"], wikidata_dump_type=["form"], data_types=["nouns"], - output_dir=DEFAULT_JSON_EXPORT_DIR, wikidata_dump_path=custom_path, overwrite_all=False, ) @patch("scribe_data.cli.get.query_data") - def test_get_data_with_multiple_languages(self, mock_query_data: MagicMock) -> None: + def test_cli_get_data_with_multiple_languages( + self, mock_query_data: MagicMock + ) -> None: """ Test retrieving data for multiple languages. """ @@ -624,13 +570,14 @@ def test_get_data_with_multiple_languages(self, mock_query_data: MagicMock) -> N mock_query_data.assert_called_once_with( languages=["English"], # only first language is used data_types=["nouns"], - output_dir=DEFAULT_JSON_EXPORT_DIR, overwrite=False, interactive=False, ) @patch("scribe_data.cli.get.query_data") - def test_error_handling_value_error(self, mock_query_data: MagicMock) -> None: + def test_cli_get_error_handling_value_error( + self, mock_query_data: MagicMock + ) -> None: """ Test handling of ValueError during data retrieval. """ @@ -640,8 +587,8 @@ def test_error_handling_value_error(self, mock_query_data: MagicMock) -> None: get_data(languages=["Invalid"], data_types=["nouns"]) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") - @patch("scribe_data.cli.get.questionary.confirm") - def test_get_data_with_all_and_specific_type( + @patch("questionary.confirm") + def test_cli_get_data_with_all_and_specific_type( self, mock_questionary: MagicMock, mock_parse: MagicMock ) -> None: """ @@ -654,14 +601,13 @@ def test_get_data_with_all_and_specific_type( languages=["all"], wikidata_dump_type=["form"], data_types=["nouns"], - output_dir=DEFAULT_JSON_EXPORT_DIR, wikidata_dump_path=None, overwrite_all=False, ) @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.check_index_exists") - def test_get_data_case_insensitive_type( + def test_cli_get_data_case_insensitive_type( self, mock_check_exists: MagicMock, mock_query_data: MagicMock ) -> None: """ @@ -673,7 +619,6 @@ def test_get_data_case_insensitive_type( mock_query_data.assert_called_once_with( languages=["English"], data_types=["NOUNS"], - output_dir=DEFAULT_JSON_EXPORT_DIR, overwrite=False, interactive=False, ) diff --git a/tests/cli/test_upgrade.py b/tests/cli/test_cli_upgrade.py similarity index 94% rename from tests/cli/test_upgrade.py rename to tests/cli/test_cli_upgrade.py index 836608e94..6f1ce2849 100644 --- a/tests/cli/test_upgrade.py +++ b/tests/cli/test_cli_upgrade.py @@ -11,7 +11,7 @@ from scribe_data.cli.version import UNKNOWN_VERSION_NOT_FETCHED -class TestUpgradeCLI: +class TestCLIUpgrade: """ Test cases for the upgrade_cli function. """ @@ -19,7 +19,7 @@ class TestUpgradeCLI: @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_unable_to_fetch_latest_version( + def test_cli_upgrade_unable_to_fetch_latest_version( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -40,7 +40,7 @@ def test_upgrade_cli_unable_to_fetch_latest_version( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_already_latest_version( + def test_cli_upgrade_already_latest_version( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -61,7 +61,7 @@ def test_upgrade_cli_already_latest_version( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_suggest_latest_version( + def test_cli_upgrade_suggest_latest_version( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -86,7 +86,7 @@ def test_upgrade_cli_suggest_latest_version( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_successful_upgrade( + def test_cli_upgrade_successful_upgrade( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -117,7 +117,7 @@ def test_upgrade_cli_successful_upgrade( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_subprocess_error( + def test_cli_upgrade_subprocess_error( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -149,7 +149,7 @@ def test_upgrade_cli_subprocess_error( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_version_parsing_edge_cases( + def test_cli_upgrade_version_parsing_edge_cases( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -170,7 +170,7 @@ def test_upgrade_cli_version_parsing_edge_cases( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_string_comparison_edge_case( + def test_cli_upgrade_string_comparison_edge_case( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -197,7 +197,7 @@ def test_upgrade_cli_string_comparison_edge_case( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_proper_higher_version_scenario( + def test_cli_upgrade_proper_higher_version_scenario( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -221,7 +221,7 @@ def test_upgrade_cli_proper_higher_version_scenario( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_different_version_formats( + def test_cli_upgrade_different_version_formats( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -242,7 +242,7 @@ def test_upgrade_cli_different_version_formats( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_semantic_version_upgrade_needed( + def test_cli_upgrade_semantic_version_upgrade_needed( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -269,7 +269,7 @@ def test_upgrade_cli_semantic_version_upgrade_needed( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_with_empty_version_strings( + def test_cli_upgrade_with_empty_version_strings( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -299,7 +299,7 @@ def test_upgrade_cli_with_empty_version_strings( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_invalid_local_version( + def test_cli_upgrade_invalid_local_version( self, mock_print: MagicMock, mock_get_latest: MagicMock, @@ -326,7 +326,7 @@ def test_upgrade_cli_invalid_local_version( @patch("scribe_data.cli.upgrade.get_local_version") @patch("scribe_data.cli.upgrade.get_latest_version") @patch("builtins.print") - def test_upgrade_cli_invalid_latest_version( + def test_cli_upgrade_invalid_latest_version( self, mock_print: MagicMock, mock_get_latest: MagicMock, diff --git a/tests/cli/test_utils.py b/tests/cli/test_cli_utils.py similarity index 65% rename from tests/cli/test_utils.py rename to tests/cli/test_cli_utils.py index 92cb8eee1..e9ab92abf 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_cli_utils.py @@ -3,14 +3,27 @@ Tests for the CLI utils functionality. """ +import sys import unittest +from pathlib import Path from unittest.mock import MagicMock, patch +import pytest + from scribe_data.cli.cli_utils import ( correct_data_type, print_formatted_data, validate_language_and_data_type, ) +from scribe_data.utils import ( + format_sublanguage_name, + get_language_from_iso, + get_language_iso, + get_language_qid, + list_all_languages, +) + +sys.path.append(Path(__file__).parent.parent.parent) # MARK: Utils @@ -128,7 +141,7 @@ def mock_get_qid(self, input_value: str) -> str | None: """ return self.qid_mapping.get(input_value.lower()) - @patch("scribe_data.cli.total.get_qid_by_input") + @patch("scribe_data.cli.total.query.get_qid_by_input") def test_validate_language_and_data_type_valid( self, mock_get_qid: MagicMock ) -> None: @@ -143,7 +156,7 @@ def test_validate_language_and_data_type_valid( except ValueError: self.fail("validate_language_and_data_type raised ValueError unexpectedly!") - @patch("scribe_data.cli.total.get_qid_by_input") + @patch("scribe_data.cli.total.query.get_qid_by_input") def test_validate_language_and_data_type_invalid_language( self, mock_get_qid: MagicMock ) -> None: @@ -159,7 +172,7 @@ def test_validate_language_and_data_type_invalid_language( self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.") - @patch("scribe_data.cli.total.get_qid_by_input") + @patch("scribe_data.cli.total.query.get_qid_by_input") def test_validate_language_and_data_type_invalid_data_type( self, mock_get_qid: MagicMock ) -> None: @@ -175,7 +188,7 @@ def test_validate_language_and_data_type_invalid_data_type( self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.") - @patch("scribe_data.cli.total.get_qid_by_input") + @patch("scribe_data.cli.total.query.get_qid_by_input") def test_validate_language_and_data_type_both_invalid( self, mock_get_qid: MagicMock ) -> None: @@ -230,3 +243,170 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self) -> None: validate_language_and_data_type(languages, data_types) self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception)) self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception)) + + +# MARK: Languages + + +@pytest.mark.parametrize( + "language, qid_code", + [ + ("english", "Q1860"), + ("french", "Q150"), + ("german", "Q188"), + ("italian", "Q652"), + ("portuguese", "Q5146"), + ("russian", "Q7737"), + ("spanish", "Q1321"), + ("swedish", "Q9027"), + ("bokmål", "Q25167"), + ], +) +def test_get_language_qid_positive(language: str, qid_code: str) -> None: + assert get_language_qid(language) == qid_code + + +def test_get_language_qid_negative() -> None: + with pytest.raises(ValueError) as excp: + _ = get_language_qid("Newspeak") + + assert ( + str(excp.value) + == "Newspeak is currently not a supported language for QID conversion." + ) + + +@pytest.mark.parametrize( + "language, iso_code", + [ + ("english", "en"), + ("french", "fr"), + ("german", "de"), + ("italian", "it"), + ("portuguese", "pt"), + ("russian", "ru"), + ("spanish", "es"), + ("swedish", "sv"), + ("bokmål", "nb"), + ], +) +def test_get_language_iso_positive(language: str, iso_code: str) -> None: + assert get_language_iso(language) == iso_code + + +def test_get_language_iso_negative() -> None: + with pytest.raises(ValueError) as excp: + _ = get_language_iso("Gibberish") + + assert ( + str(excp.value) + == "Gibberish is currently not a supported language for ISO conversion." + ) + + +@pytest.mark.parametrize( + "iso_code, language", + [ + ("en", "English"), + ("fr", "French"), + ("de", "German"), + ("it", "Italian"), + ("pt", "Portuguese"), + ("ru", "Russian"), + ("es", "Spanish"), + ("sv", "Swedish"), + ("nb", "Bokmål"), + ], +) +def test_get_language_from_iso_positive(iso_code: str, language: str) -> None: + assert get_language_from_iso(iso_code) == language + + +def test_get_language_from_iso_negative() -> None: + with pytest.raises(ValueError) as excp: + _ = get_language_from_iso("ixi") + + assert str(excp.value) == "IXI is currently not a supported ISO language." + + +@pytest.mark.parametrize( + "lang, expected_output", + [ + ("nynorsk", "nynorsk norwegian"), + ("bokmål", "bokmål norwegian"), + ("english", "english"), + ], +) +def test_format_sublanguage_name_positive(lang: str, expected_output: str) -> None: + assert format_sublanguage_name(lang) == expected_output + + +@pytest.mark.parametrize( + "lang, expected_output", + [ + ("Q42", "Q42"), # test that any QID is returned + ("Q1860", "Q1860"), + ], +) +def test_format_sublanguage_name_qid_positive(lang: str, expected_output: str) -> None: + assert format_sublanguage_name(lang) == expected_output + + +def test_format_sublanguage_name_negative() -> None: + with pytest.raises(ValueError) as excp: + _ = format_sublanguage_name("Newspeak") + + assert str(excp.value) == "Newspeak is not a valid language or sub-language." + + +def test_list_all_languages() -> None: + expected_languages = [ + "arabic", + "basque", + "bengali", + "bokmål norwegian", + "czech", + "dagbani", + "danish", + "english", + "esperanto", + "estonian", + "finnish", + "french", + "german", + "greek", + "gurmukhi punjabi", + "hausa", + "hebrew", + "hindi hindustani", + "igbo", + "indonesian", + "italian", + "japanese", + "korean", + "kurmanji", + "latin", + "latvian", + "malay", + "malayalam", + "mandarin chinese", + "nigerian pidgin", + "northern sami", + "nynorsk norwegian", + "persian", + "polish", + "portuguese", + "russian", + "shahmukhi punjabi", + "slovak", + "spanish", + "swahili", + "swedish", + "tajik", + "tamil", + "ukrainian", + "urdu hindustani", + "yoruba", + ] + + assert list_all_languages() == expected_languages diff --git a/tests/cli/test_version.py b/tests/cli/test_cli_version.py similarity index 87% rename from tests/cli/test_version.py rename to tests/cli/test_cli_version.py index 5f7083f4d..d1ed1076b 100644 --- a/tests/cli/test_version.py +++ b/tests/cli/test_cli_version.py @@ -18,7 +18,7 @@ class TestVersionFunctions(unittest.TestCase): @patch("scribe_data.cli.version.importlib.metadata.version") - def test_get_local_version_installed(self, mock_version: MagicMock) -> None: + def test_cli_version_get_local_installed(self, mock_version: MagicMock) -> None: mock_version.return_value = "1.0.0" self.assertEqual(get_local_version(), "1.0.0") @@ -26,24 +26,24 @@ def test_get_local_version_installed(self, mock_version: MagicMock) -> None: "scribe_data.cli.version.importlib.metadata.version", side_effect=importlib.metadata.PackageNotFoundError, ) - def test_get_local_version_not_installed(self, mock_version: MagicMock) -> None: + def test_cli_version_local_not_installed(self, mock_version: MagicMock) -> None: self.assertEqual(get_local_version(), UNKNOWN_VERSION_NOT_PIP) @patch("requests.get") - def test_get_latest_version(self, mock_get: MagicMock) -> None: + def test_cli_version_latest(self, mock_get: MagicMock) -> None: mock_get.return_value.status_code = 200 mock_get.return_value.json.return_value = {"name": "v1.0.1"} self.assertEqual(get_latest_version(), "v1.0.1") @patch("requests.get", side_effect=Exception("Unable to fetch version")) - def test_get_latest_version_failure(self, mock_get: MagicMock) -> None: + def test_cli_version_latest_failure(self, mock_get: MagicMock) -> None: self.assertEqual(get_latest_version(), UNKNOWN_VERSION_NOT_FETCHED) @patch("scribe_data.cli.version.get_local_version", return_value="X.Y.Z") @patch( "scribe_data.cli.version.get_latest_version", return_value="Scribe-Data X.Y.Z" ) - def test_get_version_message_up_to_date( + def test_cli_version_message_up_to_date( self, mock_latest_version: MagicMock, mock_local_version: MagicMock ) -> None: """ @@ -56,7 +56,7 @@ def test_get_version_message_up_to_date( @patch( "scribe_data.cli.version.get_latest_version", return_value="Scribe-Data X.Y.Z" ) - def test_upgrade_available( + def test_cli_version_upgrade_available( self, mock_latest_version: MagicMock, mock_local_version: MagicMock ) -> None: """ @@ -72,7 +72,7 @@ def test_upgrade_available( @patch( "scribe_data.cli.version.get_latest_version", return_value="Scribe-Data X.Y.Z" ) - def test_local_version_unknown( + def test_cli_version_local_unknown( self, mock_latest_version: MagicMock, mock_local_version: MagicMock ) -> None: """ @@ -85,7 +85,7 @@ def test_local_version_unknown( "scribe_data.cli.version.get_latest_version", return_value=UNKNOWN_VERSION_NOT_FETCHED, ) - def test_latest_version_unknown( + def test_cli_version_latest_unknown( self, mock_latest_version: MagicMock, mock_local_version: MagicMock ) -> None: """ diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py deleted file mode 100644 index 6df2b2702..000000000 --- a/tests/cli/test_convert.py +++ /dev/null @@ -1,500 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Tests for the CLI convert functionality. -""" - -import json -import unittest -from io import StringIO -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from scribe_data.cli.convert import ( - convert_to_csv_or_tsv, - convert_to_json, - convert_wrapper, -) - - -class TestConvert(unittest.TestCase): - # MARK: Helper Functions - - def normalize_line_endings(self, data: str) -> str: - """ - Normalize line endings in a given string. - - - Parameters - ---------- - data: str - The input string whose line endings are to be normalized. - - Returns - --------- - data: str - The input string with normalized line endings. - """ - return data.replace("\r\n", "\n").replace("\r", "\n") - - @pytest.fixture(autouse=True) - def _setup_fixtures(self, tmp_path): - self.tmp_path = tmp_path - - # MARK: JSON - - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_empty_language(self, mock_path: MagicMock) -> None: - csv_data = "key,value\na,1\nb,2" - mock_file = StringIO(csv_data) - - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj - mock_path_obj.suffix = ".csv" - mock_path_obj.exists.return_value = True - mock_path_obj.open.return_value.__enter__.return_value = mock_file - - with self.assertRaises(ValueError) as context: - convert_to_json( - language="", - data_types="nouns", - input_file=Path("input.csv"), - output_dir=Path("/output_dir"), - output_type="json", - overwrite=True, - ) - self.assertIn("Language '' is not recognized.", str(context.exception)) - - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_supported_file_extension_csv( - self, mock_path_class: MagicMock - ) -> None: - mock_path_instance = MagicMock(spec=Path) - - mock_path_class.return_value = mock_path_instance - - mock_path_instance.suffix = ".csv" - mock_path_instance.exists.return_value = True - - convert_to_json( - language="English", - data_types="nouns", - input_file=Path("test.csv"), - output_dir=Path("/output_dir"), - output_type="json", - overwrite=True, - ) - - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_supported_file_extension_tsv( - self, mock_path_class: MagicMock - ) -> None: - mock_path_instance = MagicMock(spec=Path) - - mock_path_class.return_value = mock_path_instance - - mock_path_instance.suffix = ".tsv" - mock_path_instance.exists.return_value = True - - convert_to_json( - language="English", - data_types="nouns", - input_file=Path("test.tsv"), - output_dir=Path("/output_dir"), - output_type="json", - overwrite=True, - ) - - def test_convert_to_json_unsupported_file_extension(self) -> None: - input_file = self.tmp_path / "test.txt" - input_file.write_text("Hello, world!", encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - with self.assertRaises(ValueError) as context: - convert_to_json( - language="English", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="json", - overwrite=True, - ) - - self.assertIn("Unsupported file extension", str(context.exception)) - self.assertEqual( - str(context.exception), - f"Unsupported file extension '.txt' for {input_file}. Please provide a '.csv' or '.tsv' file.", - ) - - # MARK: JSON - - def test_convert_to_json_standard_csv(self) -> None: - csv_data = "key,value\na,1\nb,2" - expected_json_output = {"a": "1", "b": "2"} - - input_file = self.tmp_path / "test.csv" - input_file.write_text(csv_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_json( - language="English", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="json", - overwrite=True, - ) - - output_file = output_dir / "English" / "nouns.json" - with open(output_file, "r", encoding="utf-8") as f: - actual_content = json.load(f) - - assert actual_content == expected_json_output - - def test_convert_to_json_with_multiple_keys(self) -> None: - csv_data = "key,value1,value2\na,1,x\nb,2,y\nc,3,z" - expected_json_output = { - "a": {"value1": "1", "value2": "x"}, - "b": {"value1": "2", "value2": "y"}, - "c": {"value1": "3", "value2": "z"}, - } - - input_file = self.tmp_path / "test.csv" - input_file.write_text(csv_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_json( - language="English", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="json", - overwrite=True, - ) - - output_file = output_dir / "English" / "nouns.json" - with open(output_file, "r", encoding="utf-8") as f: - actual_content = json.load(f) - - assert actual_content == expected_json_output - - def test_convert_to_json_with_complex_structure(self) -> None: - csv_data = "key,emoji,is_base,rank\na,😀,true,1\nb,😅,false,2" - expected_json_output = { - "a": [{"emoji": "😀", "is_base": True, "rank": 1}], - "b": [{"emoji": "😅", "is_base": False, "rank": 2}], - } - - input_file = self.tmp_path / "test.csv" - input_file.write_text(csv_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_json( - language="English", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="json", - overwrite=True, - ) - - output_file = output_dir / "English" / "nouns.json" - with open(output_file, "r", encoding="utf-8") as f: - actual_content = json.load(f) - - assert actual_content == expected_json_output - - # MARK: CSV or TSV - - def test_convert_to_csv_or_json_empty_language(self) -> None: - json_data = '{"key1": "value1", "key2": "value2"}' - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - with self.assertRaises(ValueError) as context: - convert_to_csv_or_tsv( - language="", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="csv", - overwrite=True, - ) - - self.assertEqual(str(context.exception), "Language '' is not recognized.") - - def test_convert_to_csv_or_tsv_standard_dict_to_csv(self) -> None: - json_data = '{"a": "1", "b": "2"}' - expected_csv_output = "preposition,value\na,1\nb,2\n" - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_csv_or_tsv( - language="English", - data_types="prepositions", - input_file=input_file, - output_dir=output_dir, - output_type="csv", - overwrite=True, - ) - - output_file = output_dir / "English" / "prepositions.csv" - actual_content = output_file.read_text(encoding="utf-8") - assert actual_content == expected_csv_output - - def test_convert_to_csv_or_tsv_standard_dict_to_tsv(self) -> None: - json_data = '{"a": "1", "b": "2"}' - expected_tsv_output = "preposition\tvalue\na\t1\nb\t2\n" - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_csv_or_tsv( - language="English", - data_types="prepositions", - input_file=input_file, - output_dir=output_dir, - output_type="tsv", - overwrite=True, - ) - - output_file = output_dir / "English" / "prepositions.tsv" - actual_content = output_file.read_text(encoding="utf-8") - assert actual_content == expected_tsv_output - - def test_convert_to_csv_or_tsv_nested_dict_to_csv(self) -> None: - json_data = ( - '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' - ) - expected_csv_output = "noun,value1,value2\na,1,x\nb,2,y\n" - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_csv_or_tsv( - language="English", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="csv", - overwrite=True, - ) - - output_file = output_dir / "English" / "nouns.csv" - actual_content = output_file.read_text(encoding="utf-8") - assert actual_content == expected_csv_output - - def test_convert_to_csv_or_tsv_nested_dict_to_tsv(self) -> None: - json_data = ( - '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' - ) - expected_tsv_output = "noun\tvalue1\tvalue2\na\t1\tx\nb\t2\ty\n" - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_csv_or_tsv( - language="English", - data_types="nouns", - input_file=input_file, - output_dir=output_dir, - output_type="tsv", - overwrite=True, - ) - - output_file = output_dir / "English" / "nouns.tsv" - actual_content = output_file.read_text(encoding="utf-8") - assert actual_content == expected_tsv_output - - def test_convert_to_csv_or_tsv_list_of_dicts_to_csv(self) -> None: - json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' - expected_csv_output = "word,emoji,is_base,rank\na,😀,True,1\na,😅,False,2\n" - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_csv_or_tsv( - language="English", - data_types="emoji-keywords", - input_file=input_file, - output_dir=output_dir, - output_type="csv", - overwrite=True, - ) - - output_file = output_dir / "English" / "emoji-keywords.csv" - actual_content = output_file.read_text(encoding="utf-8") - assert actual_content == expected_csv_output - - def test_convert_to_csv_or_tsv_list_of_dicts_to_tsv(self) -> None: - json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' - expected_tsv_output = ( - "word\temoji\tis_base\trank\na\t😀\tTrue\t1\na\t😅\tFalse\t2\n" - ) - - input_file = self.tmp_path / "test.json" - input_file.write_text(json_data, encoding="utf-8") - output_dir = self.tmp_path / "output" - output_dir.mkdir(parents=True, exist_ok=True) - - convert_to_csv_or_tsv( - language="English", - data_types="emoji-keywords", - input_file=input_file, - output_dir=output_dir, - output_type="tsv", - overwrite=True, - ) - - output_file = output_dir / "English" / "emoji-keywords.tsv" - actual_content = output_file.read_text(encoding="utf-8") - assert actual_content == expected_tsv_output - - # MARK: SQLITE - - @patch("scribe_data.cli.convert.Path", autospec=True) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) - @patch("shutil.copy") - def test_convert_to_sqlite( - self, - mock_shutil_copy: MagicMock, - mock_data_to_sqlite: MagicMock, - mock_path: MagicMock, - ) -> None: - mock_path.return_value.exists.return_value = True - - convert_wrapper( - languages=["english"], - data_types=["nouns"], - input_path=Path("file"), - output_dir=Path("/output"), - output_type="sqlite", - overwrite=True, - identifier_case="camel", - ) - - mock_data_to_sqlite.assert_called_with( - languages=["english"], - specific_tables=["nouns"], - identifier_case="camel", - input_file=Path("file"), - output_file=Path("/output"), - overwrite=True, - ) - - @patch("scribe_data.cli.convert.Path", autospec=True) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) - def test_convert_to_sqlite_no_output_dir( - self, mock_data_to_sqlite: MagicMock, mock_path: MagicMock - ) -> None: - mock_input_file = MagicMock() - mock_input_file.exists.return_value = True - - mock_path.return_value = mock_input_file - - mock_input_file.parent = MagicMock() - mock_input_file.parent.__truediv__.return_value = MagicMock() - mock_input_file.parent.__truediv__.return_value.exists.return_value = False - - convert_wrapper( - languages=["english"], - data_types=["nouns"], - input_path=Path(mock_input_file), - output_dir=None, - output_type="sqlite", - overwrite=True, - identifier_case="camel", - ) - - mock_data_to_sqlite.assert_called_with( - languages=["english"], - specific_tables=["nouns"], - identifier_case="camel", - input_file=Path(mock_input_file), - output_file=Path("scribe_data_sqlite_export"), - overwrite=True, - ) - - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) - def test_convert_wrapper_german_wiktionary_translations_sqlite( - self, mock_data_to_sqlite: MagicMock - ) -> None: - convert_wrapper( - languages=["german"], - data_types=["wiktionary_translations"], - input_path=Path("/input"), - output_dir=Path("/output"), - output_type="sqlite", - overwrite=False, - identifier_case="camel", - ) - - mock_data_to_sqlite.assert_called_once_with( - languages=["german"], - specific_tables=["wiktionary_translations"], - identifier_case="camel", - input_file=Path("/input"), - output_file=Path("/output"), - overwrite=False, - ) - - @patch( - "scribe_data.cli.convert.DEFAULT_WIKTIONARY_JSON_EXPORT_DIR", - new=Path("/mock_wiktionary_dir"), - ) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) - def test_convert_wrapper_wiktionary_no_input_path_uses_wiktionary_default( - self, mock_data_to_sqlite: MagicMock - ) -> None: - convert_wrapper( - languages=["german"], - data_types=["wiktionary_translations"], - input_path=None, - output_dir=Path("/output"), - output_type="sqlite", - overwrite=False, - ) - - mock_data_to_sqlite.assert_called_once_with( - languages=["german"], - specific_tables=["wiktionary_translations"], - identifier_case="camel", - input_file=Path("/mock_wiktionary_dir"), - output_file=Path("/output"), - overwrite=False, - ) - - def test_convert(self) -> None: - with self.assertRaises(ValueError) as context: - convert_wrapper( - languages=["English"], - data_types=["nouns"], - input_path=Path("Data/ecode.csv"), - output_dir=Path("/output_dir"), - output_type="parquet", - overwrite=True, - ) - - self.assertEqual( - str(context.exception), - "Unsupported output type 'parquet'. Must be 'json', 'csv', 'tsv' or 'sqlite'.", - ) diff --git a/tests/cli/test_interactive.py b/tests/cli/test_interactive.py deleted file mode 100644 index 8f35171bf..000000000 --- a/tests/cli/test_interactive.py +++ /dev/null @@ -1,260 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Tests for the CLI interactive mode functionality. -""" - -import tempfile -import unittest -from pathlib import Path -from unittest.mock import MagicMock, call, patch - -from prompt_toolkit.completion import WordCompleter - -from scribe_data.cli.interactive import ( - ScribeDataConfig, - configure_settings, - display_summary, - prompt_for_data_types, - prompt_for_languages, - run_request, -) - - -class TestScribeDataInteractive(unittest.TestCase): - def setUp(self) -> None: - """ - Set up test fixtures before each test method. - """ - self.config = ScribeDataConfig() - # Mock the language_metadata and data_type_metadata. - self.config.languages = ["english", "spanish", "french"] - self.config.data_types = ["nouns", "verbs"] - - def test_scribe_data_config_initialization(self) -> None: - """ - Test ScribeDataConfig initialization. - """ - self.assertEqual(self.config.selected_languages, []) - self.assertEqual(self.config.selected_data_types, []) - self.assertEqual(self.config.output_type, "json") - self.assertIsInstance(self.config.output_dir, Path) - self.assertFalse(self.config.overwrite) - self.assertFalse(self.config.configured) - - @patch("scribe_data.cli.interactive.prompt") - @patch("scribe_data.cli.interactive.rprint") - def test_configure_settings_all_languages( - self, mock_rprint: MagicMock, mock_prompt: MagicMock - ) -> None: - """ - Test configure_settings with 'All' languages selection. - """ - # Set up mock responses. - responses = iter( - [ - "All", # languages - "nouns", # data types - "json", # output type - "", # output directory (default) - "y", # overwrite - ] - ) - mock_prompt.side_effect = lambda *args, **kwargs: next(responses) - - with patch("scribe_data.cli.interactive.config", self.config): - with patch("scribe_data.cli.interactive.display_summary"): - configure_settings() - - self.assertEqual(self.config.selected_languages, self.config.languages) - self.assertEqual(self.config.selected_data_types, ["nouns"]) - self.assertEqual(self.config.output_type, "json") - self.assertTrue(self.config.configured) - - @patch("scribe_data.cli.interactive.prompt") - @patch("scribe_data.cli.interactive.rprint") - def test_configure_settings_specific_languages( - self, mock_rprint: MagicMock, mock_prompt: MagicMock - ) -> None: - """ - Test configure_settings with specific language selection. - """ - # Set up mock responses. - responses = iter( - [ - "english, spanish", # languages - "nouns, verbs", # data types - "csv", # output type - "/custom/path", # output directory - "n", # overwrite - ] - ) - mock_prompt.side_effect = lambda *args, **kwargs: next(responses) - - with patch("scribe_data.cli.interactive.config", self.config): - with patch("scribe_data.cli.interactive.display_summary"): - configure_settings() - - self.assertEqual(self.config.selected_languages, ["english", "spanish"]) - self.assertEqual(self.config.selected_data_types, ["nouns", "verbs"]) - self.assertEqual(self.config.output_type, "csv") - self.assertEqual(self.config.output_dir.as_posix(), "/custom/path") - self.assertFalse(self.config.overwrite) - - @patch("scribe_data.cli.interactive.get_data") - @patch("scribe_data.cli.interactive.tqdm") - @patch("scribe_data.cli.interactive.logger") - def test_run_request( - self, mock_logger: MagicMock, mock_tqdm: MagicMock, mock_get_data: MagicMock - ) -> None: - """ - Test run_request functionality. - """ - self.config.selected_languages = ["english"] - self.config.selected_data_types = ["nouns"] - self.config.configured = True - - mock_get_data.return_value = True - mock_progress = MagicMock() - mock_tqdm.return_value.__enter__.return_value = mock_progress - - with patch("scribe_data.cli.interactive.config", self.config): - run_request() - - mock_get_data.assert_called_once_with( - languages=["english"], - data_types=["nouns"], - output_type=self.config.output_type, - output_dir=self.config.output_dir, - overwrite=self.config.overwrite, - interactive=True, - ) - - @patch("scribe_data.cli.interactive.prompt") - @patch("scribe_data.cli.interactive.rprint") - def test_request_total_lexeme( - self, mock_rprint: MagicMock, mock_prompt: MagicMock - ) -> None: - """ - Test request_total_lexeme functionality. - """ - # Set up mock responses. - mock_prompt.side_effect = [ - "english, french", # first call for languages - "nouns", # first call for data types - ] - - with patch("scribe_data.cli.interactive.config", self.config): - with patch( - "scribe_data.cli.interactive.list_all_languages", - return_value=["english", "french"], - ): - prompt_for_languages() - prompt_for_data_types() - - # Verify the config was updated correctly. - self.assertEqual(self.config.selected_languages, ["english", "french"]) - self.assertEqual(self.config.selected_data_types, ["nouns"]) - - # Verify prompt was called with correct arguments. - expected_calls = [ - call( - "Select languages (comma-separated or 'All'): ", - completer=unittest.mock.ANY, - default="", - ), - call( - "Select data types (comma-separated or 'All'): ", - completer=unittest.mock.ANY, - default="", - ), - ] - mock_prompt.assert_has_calls(expected_calls, any_order=False) - - @patch("rich.console.Console.print") - def test_display_summary(self, mock_print: MagicMock) -> None: - """ - Test display_summary functionality. - """ - self.config.selected_languages = ["english"] - self.config.selected_data_types = ["nouns"] - self.config.output_type = "json" - - with patch("scribe_data.cli.interactive.config", self.config): - display_summary() - mock_print.assert_called() - - def test_resolve_wiktionary_dump_path_from_subdirectory(self) -> None: - """ - Find dumps when cwd is not the project root. - """ - from scribe_data.cli.interactive import resolve_wiktionary_dump_path - - with patch("os.getcwd") as mock_getcwd: - with tempfile.TemporaryDirectory() as tmp: - root = Path(tmp) - dump_dir = root / "scribe_data_wiktionary_dumps_export" - json_dir = root / "scribe_data_json_export" - dump_dir.mkdir() - json_dir.mkdir() - dump_file = dump_dir / "dewiktionary-pages-articles.xml.bz2" - dump_file.write_bytes(b"x") - - mock_getcwd.return_value = str(json_dir) - resolved = resolve_wiktionary_dump_path( - "german", - "scribe_data_wiktionary_dumps_export", - ) - - self.assertEqual(resolved, dump_file.resolve()) - - def test_create_word_completer(self) -> None: - """ - Test create_word_completer functionality. - """ - from scribe_data.cli.interactive import create_word_completer - - # Test without 'All' option. - options = ["english", "spanish", "french"] - completer = create_word_completer(options, include_all=False) - self.assertIsInstance(completer, WordCompleter) - self.assertEqual(completer.words, options) - - # Test with 'All' option. - completer_with_all = create_word_completer(options, include_all=True) - self.assertEqual(completer_with_all.words, ["All"] + options) - - @patch( - "scribe_data.cli.interactive.resolve_wiktionary_dump_path", - return_value=Path("/dump/path"), - ) - @patch("scribe_data.wiktionary.parse_translations.parse_wiktionary_translations") - @patch("scribe_data.cli.interactive.prompt") - @patch("scribe_data.cli.interactive.prompt_for_languages") - @patch("scribe_data.cli.interactive.questionary.select") - def test_start_interactive_mode_translations( - self, - mock_select, - mock_prompt_languages, - mock_prompt, - mock_parse_wiktionary, - mock_resolve_dump, - ): - from scribe_data.cli.interactive import config, start_interactive_mode - - mock_select.return_value.ask.side_effect = ["translations"] - mock_prompt.side_effect = [ - "german", - "/dump/path", - "scribe_data_wiktionary_json_export", - "false", - ] - config.selected_languages = ["english"] - - start_interactive_mode(operation="translations") - - mock_parse_wiktionary.assert_called_once_with( - target_languages=["english"], - wiktionary_dump_path=Path("/dump/path"), - output_dir=Path("scribe_data_wiktionary_json_export"), - overwrite=False, - ) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py deleted file mode 100644 index 5d45b50a4..000000000 --- a/tests/cli/test_list.py +++ /dev/null @@ -1,211 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Tests for the CLI list functionality. -""" - -import unittest -from unittest.mock import MagicMock, call, patch - -from scribe_data.cli.list import ( - get_language_iso, - get_language_qid, - list_all, - list_data_types, - list_languages, - list_languages_for_data_type, - list_wrapper, -) -from scribe_data.cli.main import main -from scribe_data.utils import ( - list_all_languages, - list_languages_with_metadata_for_data_type, -) - - -class TestListFunctions(unittest.TestCase): - @patch("builtins.print") - def test_list_languages(self, mock_print: MagicMock) -> None: - list_languages() - - # Verify the headers. - mock_print.assert_any_call("\nLanguage ISO QID ") - mock_print.assert_any_call("=================================") - - # Dynamically get the first language from the metadata. - languages = list_all_languages() - first_language = languages[0] - first_iso = get_language_iso(first_language) - first_qid = get_language_qid(first_language) - - # Verify the first language entry. - # Calculate column widths as in the actual function. - language_col_width = max(len(lang) for lang in languages) + 2 - iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 - qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 - - # Verify the first language entry with dynamic spacing. - mock_print.assert_any_call( - f"{first_language.capitalize():<{language_col_width}} {first_iso:<{iso_col_width}} {first_qid:<{qid_col_width}}" - ) - # Total print calls: N (languages) + 3 (header, one separator, final line). - self.assertEqual(mock_print.call_count, len(languages) + 3) - - @patch("builtins.print") - def test_list_data_types_all_languages(self, mock_print: MagicMock) -> None: - list_data_types() - print(mock_print.mock_calls) - expected_calls = [ - call(), - call("Available data types: All languages"), - call("==================================="), - call("adjectives"), - call("adverbs"), - # call("articles"), - call("conjunctions"), - call("emoji-keywords"), - call("nouns"), - call("personal-pronouns"), - call("postpositions"), - call("prepositions"), - call("pronouns"), - call("proper-nouns"), - call("verbs"), - call(), - ] - mock_print.assert_has_calls(expected_calls) - - @patch("builtins.print") - def test_list_data_types_specific_language(self, mock_print: MagicMock) -> None: - list_data_types("english") - - expected_calls = [ - call(), - call("Available data types: English"), - call("============================="), - call("adjectives"), - call("adverbs"), - call("emoji-keywords"), - call("nouns"), - call("personal-pronouns"), - call("prepositions"), - call("pronouns"), - call("proper-nouns"), - call("verbs"), - call(), - ] - mock_print.assert_has_calls(expected_calls) - - def test_list_data_types_invalid_language(self) -> None: - with self.assertRaises(ValueError): - list_data_types("InvalidLanguage") - - def test_list_data_types_no_data_types(self) -> None: - with self.assertRaises(ValueError): - list_data_types("Klingon") - - @patch("scribe_data.cli.list.list_languages") - @patch("scribe_data.cli.list.list_data_types") - def test_list_all( - self, mock_list_data_types: MagicMock, mock_list_languages: MagicMock - ) -> None: - list_all() - mock_list_languages.assert_called_once() - mock_list_data_types.assert_called_once() - - @patch("scribe_data.cli.list.list_all") - def test_list_wrapper_all(self, mock_list_all: MagicMock) -> None: - list_wrapper(all_bool=True) - mock_list_all.assert_called_once() - - @patch("scribe_data.cli.list.list_languages") - def test_list_wrapper_languages(self, mock_list_languages: MagicMock) -> None: - list_wrapper(language=True) - mock_list_languages.assert_called_once() - - @patch("scribe_data.cli.list.list_data_types") - def test_list_wrapper_data_types(self, mock_list_data_types: MagicMock) -> None: - list_wrapper(data_type=True) - mock_list_data_types.assert_called_once() - - @patch("builtins.print") - def test_list_wrapper_language_and_data_type(self, mock_print: MagicMock) -> None: - list_wrapper(language=True, data_type=True) - mock_print.assert_called_with( - "Please specify either a language or a data type." - ) - - @patch("scribe_data.cli.list.list_languages_for_data_type") - def test_list_wrapper_languages_for_data_type( - self, mock_list_languages_for_data_type: MagicMock - ) -> None: - list_wrapper(language=True, data_type="example_data_type") - mock_list_languages_for_data_type.assert_called_with("example_data_type") - - @patch("scribe_data.cli.list.list_data_types") - def test_list_wrapper_data_types_for_language( - self, mock_list_data_types: MagicMock - ) -> None: - list_wrapper(language="English", data_type=True) - mock_list_data_types.assert_called_with("English") - - @patch("builtins.print") - def test_list_languages_for_data_type_valid(self, mock_print: MagicMock) -> None: - # Call the function with a specific data type. - list_languages_for_data_type("nouns") - - # Dynamically create the header based on column widths. - all_languages = list_languages_with_metadata_for_data_type() - - # Calculate column widths as in the actual function. - language_col_width = max(len(lang["name"]) for lang in all_languages) + 2 - iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2 - qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2 - - # Dynamically generate the expected header string. - expected_header = f"{'\nLanguage':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" - - # Verify the headers dynamically. - mock_print.assert_any_call(expected_header) - mock_print.assert_any_call( - "=" * (language_col_width + iso_col_width + qid_col_width) - ) - - # Verify the first language entry if there are any languages. - - first_language = all_languages[0]["name"].capitalize() - first_iso = all_languages[0]["iso"] - first_qid = all_languages[0]["qid"] - - # Verify the first language entry with dynamic spacing. - mock_print.assert_any_call( - f"{first_language:<{language_col_width}} {first_iso:<{iso_col_width}} {first_qid:<{qid_col_width}}" - ) - - # Check the total number of calls. - # Total calls = N (languages) + 3 (header, one separator, final line) - expected_calls = len(all_languages) + 3 - self.assertEqual(mock_print.call_count, expected_calls) - - @patch("scribe_data.cli.list.list_languages") - def test_list_languages_command(self, mock_list_languages: MagicMock) -> None: - test_args = ["main.py", "list", "--language"] - with patch("sys.argv", test_args): - main() - - mock_list_languages.assert_called_once() - - @patch("scribe_data.cli.list.list_data_types") - def test_list_data_types_command(self, mock_list_data_types: MagicMock) -> None: - test_args = ["main.py", "list", "--data-type"] - with patch("sys.argv", test_args): - main() - - mock_list_data_types.assert_called_once() - - @patch("scribe_data.cli.list.list_all") - def test_list_all_command(self, mock_list_all: MagicMock) -> None: - test_args = ["main.py", "list", "--all"] - with patch("sys.argv", test_args): - main() - - mock_list_all.assert_called_once() diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py deleted file mode 100644 index c999418cb..000000000 --- a/tests/cli/test_total.py +++ /dev/null @@ -1,610 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Tests for the CLI total functionality. -""" - -import unittest -from http.client import IncompleteRead -from pathlib import Path -from unittest.mock import MagicMock, call, patch -from urllib.error import HTTPError - -import yaml - -from scribe_data.cli.total import ( - get_datatype_list, - get_qid_by_input, - get_total_lexemes, - total_wrapper, -) -from scribe_data.utils import ( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - WIKIDATA_QIDS_PIDS_FILE, - check_qid_is_language, -) - -try: - with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file: - wikidata_qids_pids = yaml.safe_load(file) - -except (IOError, yaml.YAMLError) as e: - print(f"Error reading wikidata QIDs/PIDs metadata: {e}") - - -class TestTotalLexemes(unittest.TestCase): - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_valid( - self, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - mock_get_qid.side_effect = lambda x: {"english": "Q1860", "nouns": "Q1084"}.get( - x.lower() - ) - mock_results = MagicMock() - mock_results.convert.return_value = { - "results": {"bindings": [{"total": {"value": "42"}}]} - } - mock_query.return_value = mock_results - - with patch("builtins.print") as mock_print: - get_total_lexemes(language="English", data_type="nouns") - - mock_print.assert_called_once_with( - "\nLanguage: English\nData type: nouns\nTotal number of lexemes: 42\n" - ) - - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_no_results( - self, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - mock_get_qid.side_effect = lambda x: {"english": "Q1860", "nouns": "Q1084"}.get( - x.lower() - ) - mock_results = MagicMock() - mock_results.convert.return_value = {"results": {"bindings": []}} - mock_query.return_value = mock_results - - with patch("builtins.print") as mock_print: - get_total_lexemes(language="English", data_type="nouns") - - mock_print.assert_called_once_with("Total number of lexemes: Not found") - - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_invalid_language( - self, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - mock_get_qid.side_effect = lambda x: None - mock_query.return_value = MagicMock() - - with patch("builtins.print") as mock_print: - get_total_lexemes(language="InvalidLanguage", data_type="nouns") - - mock_print.assert_called_once_with("Total number of lexemes: Not found") - - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_empty_and_none_inputs( - self, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - mock_get_qid.return_value = None - mock_query.return_value = MagicMock() - - # Call the function with empty and None inputs. - with patch("builtins.print") as mock_print: - get_total_lexemes(language="", data_type="nouns") - get_total_lexemes(language=None, data_type="verbs") - - expected_calls = [ - call("Total number of lexemes: Not found"), - call("Total number of lexemes: Not found"), - ] - mock_print.assert_has_calls(expected_calls, any_order=True) - - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_nonexistent_language( - self, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - mock_get_qid.return_value = None - mock_query.return_value = MagicMock() - - with patch("builtins.print") as mock_print: - get_total_lexemes(language="Martian", data_type="nouns") - - mock_print.assert_called_once_with("Total number of lexemes: Not found") - - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_various_data_types( - self, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - mock_get_qid.side_effect = lambda x: { - "english": "Q1860", - "verbs": "Q24905", - "nouns": "Q1084", - }.get(x.lower()) - mock_results = MagicMock() - mock_results.convert.return_value = { - "results": {"bindings": [{"total": {"value": "30"}}]} - } - - mock_query.return_value = mock_results - - # Call the function with different data types. - with patch("builtins.print") as mock_print: - get_total_lexemes(language="English", data_type="verbs") - get_total_lexemes(language="English", data_type="nouns") - - expected_calls = [ - call( - "\nLanguage: English\nData type: verbs\nTotal number of lexemes: 30\n" - ), - call( - "\nLanguage: English\nData type: nouns\nTotal number of lexemes: 30\n" - ), - ] - mock_print.assert_has_calls(expected_calls) - - @patch("scribe_data.cli.total.get_qid_by_input") - @patch("scribe_data.cli.total.sparql.query") - @patch("scribe_data.cli.total.WIKIDATA_QUERIES_ALL_DATA_DIR") - def test_get_total_lexemes_sub_languages( - self, mock_dir: MagicMock, mock_query: MagicMock, mock_get_qid: MagicMock - ) -> None: - # Setup for sub-languages. - mock_get_qid.side_effect = lambda x: { - "bokmål": "Q25167", - "nynorsk": "Q25164", - }.get(x.lower()) - mock_results = MagicMock() - mock_results.convert.return_value = { - "results": {"bindings": [{"total": {"value": "30"}}]} - } - mock_query.return_value = mock_results - - # Mocking directory paths and contents. - mock_dir.__truediv__.return_value.exists.return_value = True - mock_dir.__truediv__.return_value.iterdir.return_value = [ - MagicMock(name="verbs", is_dir=lambda: True), - MagicMock(name="nouns", is_dir=lambda: True), - ] - - with patch("builtins.print") as mock_print: - get_total_lexemes(language="Norwegian", data_type="verbs") - get_total_lexemes(language="Norwegian", data_type="nouns") - - expected_calls = [ - call( - "\nLanguage: Norwegian\nData type: verbs\nTotal number of lexemes: 30\n" - ), - call( - "\nLanguage: Norwegian\nData type: nouns\nTotal number of lexemes: 30\n" - ), - ] - mock_print.assert_has_calls(expected_calls) - - -class TestGetQidByInput(unittest.TestCase): - def setUp(self) -> None: - self.valid_data_types = { - "english": "Q1860", - "nouns": "Q1084", - "verbs": "Q24905", - } - - @patch("scribe_data.cli.total.data_type_metadata", new_callable=dict) - def test_get_qid_by_input_valid(self, mock_data_type_metadata: MagicMock) -> None: - mock_data_type_metadata.update(self.valid_data_types) - - for data_type, expected_qid in self.valid_data_types.items(): - self.assertEqual(get_qid_by_input(data_type), expected_qid) - - @patch("scribe_data.cli.total.data_type_metadata", new_callable=dict) - def test_get_qid_by_input_invalid(self, mock_data_type_metadata: MagicMock) -> None: - mock_data_type_metadata.update(self.valid_data_types) - - self.assertIsNone(get_qid_by_input("invalid_data_type")) - - -class TestGetDatatypeList(unittest.TestCase): - @patch("scribe_data.cli.total.WIKIDATA_QUERIES_ALL_DATA_DIR") - def test_get_datatype_list_invalid_language(self, mock_dir: MagicMock) -> None: - mock_dir.__truediv__.return_value.exists.return_value = False - - with self.assertRaises(ValueError): - get_datatype_list("InvalidLanguage") - - @patch("scribe_data.cli.total.WIKIDATA_QUERIES_ALL_DATA_DIR") - def test_get_datatype_list_no_data_types(self, mock_dir: MagicMock) -> None: - mock_dir.__truediv__.return_value.exists.return_value = True - mock_dir.__truediv__.return_value.iterdir.return_value = [] - - with self.assertRaises(ValueError): - get_datatype_list("English") - - -class TestCheckQidIsLanguage(unittest.TestCase): - @patch("scribe_data.utils.requests.get") - def test_check_qid_is_language_valid(self, mock_get: MagicMock) -> None: - mock_response = MagicMock() - mock_response.json.return_value = { - "statements": { - wikidata_qids_pids["instance_of"]: [{"value": {"content": "Q34770"}}] - }, - "labels": {"en": "English"}, - } - mock_get.return_value = mock_response - - with patch("builtins.print") as mock_print: - result = check_qid_is_language("Q1860") - - self.assertEqual(result, "English") - mock_print.assert_called_once_with("English (Q1860) is a language.\n") - - @patch("scribe_data.utils.requests.get") - def test_check_qid_is_language_invalid(self, mock_get: MagicMock) -> None: - mock_response = MagicMock() - mock_response.json.return_value = { - "statements": { - wikidata_qids_pids["instance_of"]: [{"value": {"content": "Q5"}}] - }, - "labels": {"en": "Human"}, - } - mock_get.return_value = mock_response - - with self.assertRaises(ValueError): - check_qid_is_language("Q5") - - -class TestTotalWrapper(unittest.TestCase): - @patch("scribe_data.cli.total.print_total_lexemes") - def test_total_wrapper_all_bool(self, mock_print_total_lexemes: MagicMock) -> None: - total_wrapper(all_bool=True) - mock_print_total_lexemes.assert_called_once_with() - - @patch("scribe_data.cli.total.print_total_lexemes") - def test_total_wrapper_language_only( - self, mock_print_total_lexemes: MagicMock - ) -> None: - total_wrapper(languages=["English"]) - mock_print_total_lexemes.assert_called_once_with(language="English") - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_language_and_data_type( - self, mock_get_total_lexemes_lexemes: MagicMock - ) -> None: - total_wrapper(languages=["English"], data_types=["nouns"]) - mock_get_total_lexemes_lexemes.assert_called_once_with( - language="English", data_type="nouns" - ) - - def test_total_wrapper_invalid_input(self) -> None: - with self.assertRaises(ValueError): - total_wrapper() - - # MARK: Using Dump - - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") - def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump: MagicMock) -> None: - """ - Test when wikidata_dump is True (flag without path). - """ - total_wrapper(wikidata_dump=True) - mock_parse_dump.assert_called_once_with( - languages=["all"], - data_types=["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - ) - - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") - def test_total_wrapper_wikidata_dump_with_all( - self, mock_parse_dump: MagicMock - ) -> None: - """ - Test when both wikidata_dump and all_bool are True. - """ - total_wrapper(wikidata_dump=True, all_bool=True) - mock_parse_dump.assert_called_once_with( - languages=["all"], - data_types=["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - ) - - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") - def test_total_wrapper_wikidata_dump_with_language_and_type( - self, mock_parse_dump: MagicMock - ) -> None: - """ - Test wikidata_dump with specific language and data type. - """ - total_wrapper( - languages=["English"], - data_types=["nouns"], - wikidata_dump=Path("/path/to/dump.json"), - ) - mock_parse_dump.assert_called_once_with( - languages=["English"], - data_types=["nouns"], - wikidata_dump_type=["total"], - wikidata_dump_path=Path("/path/to/dump.json"), - ) - - # MARK: Using QID - - @patch("scribe_data.cli.total.check_qid_is_language") - @patch("scribe_data.cli.total.print_total_lexemes") - def test_total_wrapper_with_qid( - self, mock_print_total: MagicMock, mock_check_qid: MagicMock - ) -> None: - """ - Test when language is provided as a QID. - """ - mock_check_qid.return_value = "Thai" - total_wrapper(languages=["Q9217"]) - mock_print_total.assert_called_once_with(language="Q9217") - - @patch("scribe_data.cli.total.check_qid_is_language") - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_with_qid_and_datatype( - self, mock_get_total_lexemes: MagicMock, mock_check_qid: MagicMock - ) -> None: - """ - Test when language QID and data type are provided. - """ - mock_check_qid.return_value = "Thai" - total_wrapper(languages=["Q9217"], data_types=["nouns"]) - mock_get_total_lexemes.assert_called_once_with( - language="Q9217", data_type="nouns" - ) - - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") - def test_total_wrapper_qid_with_wikidata_dump( - self, mock_parse_dump: MagicMock - ) -> None: - """ - Test QID with wikidata dump. - """ - total_wrapper(languages=["Q9217"], wikidata_dump=True, all_bool=True) - mock_parse_dump.assert_called_once_with( - languages=["Q9217"], - data_types=["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - ) - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_get_total_lexemes_with_qid( - self, mock_get_total_lexemes: MagicMock - ) -> None: - """ - Test get_total_lexemes with QID input. - """ - total_wrapper(languages=["Q9217"], data_types=["Q1084"]) # Q1084 is noun QID - mock_get_total_lexemes.assert_called_once_with( - language="Q9217", data_type="Q1084" - ) - - # MARK: Multiple Languages and Data Types - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_multiple_languages( - self, mock_get_total_lexemes: MagicMock - ) -> None: - """ - Test retrieving totals for multiple languages. - """ - # Mock return value to avoid formatting error. - mock_get_total_lexemes.return_value = 100 - - total_wrapper(languages=["English", "German"], data_types=["nouns"]) - - expected_calls = [ - call(language="English", data_type="nouns", do_print=False), - call(language="German", data_type="nouns", do_print=False), - ] - mock_get_total_lexemes.assert_has_calls(expected_calls) - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_multiple_data_types( - self, mock_get_total_lexemes: MagicMock - ) -> None: - """ - Test retrieving totals for multiple data types. - """ - # Mock return value to avoid formatting error. - mock_get_total_lexemes.return_value = 100 - - total_wrapper(languages=["English"], data_types=["nouns", "verbs"]) - - expected_calls = [ - call(language="English", data_type="nouns", do_print=False), - call(language="English", data_type="verbs", do_print=False), - ] - mock_get_total_lexemes.assert_has_calls(expected_calls) - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_multiple_languages_and_types( - self, mock_get_total_lexemes: MagicMock - ) -> None: - """ - Test retrieving totals for multiple languages and data types. - """ - # Mock return value to avoid formatting error. - mock_get_total_lexemes.return_value = 100 - - total_wrapper(languages=["English", "German"], data_types=["nouns", "verbs"]) - - expected_calls = [ - call(language="English", data_type="nouns", do_print=False), - call(language="English", data_type="verbs", do_print=False), - call(language="German", data_type="nouns", do_print=False), - call(language="German", data_type="verbs", do_print=False), - ] - mock_get_total_lexemes.assert_has_calls(expected_calls) - - # MARK: Error Handling - - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_http_error(self, mock_query: MagicMock) -> None: - """ - Test handling of HTTPError when querying totals. - """ - # Set up mock to return None for results after max retries. - mock_query.side_effect = [ - HTTPError(url="test", code=500, msg="error", hdrs={}, fp=None), - HTTPError(url="test", code=500, msg="error", hdrs={}, fp=None), - HTTPError(url="test", code=500, msg="error", hdrs={}, fp=None), - ] - - with patch("builtins.print") as mock_print: - result = get_total_lexemes(language="English", data_type="nouns") - - self.assertIsNone(result) - mock_print.assert_any_call("Query failed after retries.") - - @patch("scribe_data.cli.total.sparql.query") - def test_get_total_lexemes_incomplete_read(self, mock_query: MagicMock) -> None: - """ - Test handling of IncompleteRead error when querying totals. - """ - # Set up mock to return None for results after max retries. - mock_query.side_effect = [ - IncompleteRead(partial=b""), - IncompleteRead(partial=b""), - IncompleteRead(partial=b""), - ] - - with patch("builtins.print") as mock_print: - result = get_total_lexemes(language="English", data_type="nouns") - - self.assertIsNone(result) - mock_print.assert_any_call("Query failed after retries.") - - # MARK: Sub-language Handling - - @patch("scribe_data.cli.total.get_datatype_list") - @patch("scribe_data.cli.total.get_total_lexemes") - def test_print_total_lexemes_with_sublanguages( - self, mock_get_total_lexemes: MagicMock, mock_get_datatypes: MagicMock - ) -> None: - """ - Test printing totals for a language with sub-languages. - """ - mock_get_datatypes.return_value = ["nouns", "verbs"] - mock_get_total_lexemes.return_value = 100 - - with patch("builtins.print") as mock_print: - total_wrapper(languages=["Norwegian"], data_types=["nouns", "verbs"]) - - # Verify header was printed. - mock_print.assert_any_call( - f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" - ) - mock_print.assert_any_call("=" * 70) - - # Verify data was printed for each data type. - mock_get_total_lexemes.assert_any_call( - language="Norwegian", data_type="nouns", do_print=False - ) - mock_get_total_lexemes.assert_any_call( - language="Norwegian", data_type="verbs", do_print=False - ) - - # MARK: Data Type List Handling - - @patch("scribe_data.cli.total.language_metadata") - @patch("scribe_data.cli.total.list_all_languages") - @patch("scribe_data.cli.total.WIKIDATA_QUERIES_ALL_DATA_DIR") - def test_get_datatype_list_with_sublanguages( - self, - mock_dir: MagicMock, - mock_list_languages: MagicMock, - mock_metadata: MagicMock, - ) -> None: - """ - Test getting data type list for a language with sub-languages. - """ - # Mock language metadata and list_all_languages. - mock_metadata_dict = { - "norwegian": { - "sub_languages": {"bokmal": {"iso": "nb"}, "nynorsk": {"iso": "nn"}} - } - } - - # Mock dictionary-like behavior for language_metadata. - mock_metadata.__iter__.return_value = mock_metadata_dict.items() - mock_metadata.items.return_value = mock_metadata_dict.items() - mock_metadata.get.return_value = mock_metadata_dict["norwegian"] - mock_metadata.__getitem__.return_value = mock_metadata_dict["norwegian"] - - mock_list_languages.return_value = ["norwegian"] - - # Create mock directory entries with proper string names. - mock_nouns = MagicMock() - mock_nouns.name = "nouns" - mock_nouns.is_dir.return_value = True - - mock_verbs = MagicMock() - mock_verbs.name = "verbs" - mock_verbs.is_dir.return_value = True - - # Mock directory structure for both sub-languages. - def mock_path_handler(path: str) -> MagicMock: - mock_path = MagicMock() - mock_path.exists.return_value = True - mock_path.iterdir.return_value = [mock_nouns, mock_verbs] - return mock_path - - mock_dir.__truediv__.side_effect = mock_path_handler - - result = get_datatype_list("norwegian") # note: lowercase - self.assertEqual(sorted(result), ["nouns", "verbs"]) - - @patch("scribe_data.cli.total.language_metadata") - @patch("scribe_data.cli.total.WIKIDATA_QUERIES_ALL_DATA_DIR") - def test_get_datatype_list_empty_directory( - self, mock_dir: MagicMock, mock_metadata: MagicMock - ) -> None: - """ - Test getting data type list from an empty directory. - """ - # Mock language metadata. - mock_metadata.get.return_value = {} - - mock_dir.__truediv__.return_value.exists.return_value = True - mock_dir.__truediv__.return_value.iterdir.return_value = [] - - with self.assertRaises(ValueError): - get_datatype_list("English") - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_with_invalid_language( - self, mock_get_total_lexemes: MagicMock - ) -> None: - """ - Test total wrapper with invalid language. - """ - mock_get_total_lexemes.side_effect = ValueError("Invalid language") - - with self.assertRaises(ValueError): - total_wrapper(languages=["invalid_lang"], data_types=["nouns"]) - - mock_get_total_lexemes.assert_called_once() - - @patch("scribe_data.cli.total.get_total_lexemes") - def test_total_wrapper_with_invalid_data_type( - self, mock_get_total_lexemes: MagicMock - ) -> None: - """ - Test total wrapper with invalid data type. - """ - mock_get_total_lexemes.side_effect = ValueError("Invalid data type") - - with self.assertRaises(ValueError): - total_wrapper(languages=["English"], data_types=["invalid_type"]) - - mock_get_total_lexemes.assert_called_once() diff --git a/tests/cli/total/test_cli_total_query.py b/tests/cli/total/test_cli_total_query.py new file mode 100644 index 000000000..b208ccac1 --- /dev/null +++ b/tests/cli/total/test_cli_total_query.py @@ -0,0 +1,249 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI total query functionality. +""" + +import unittest +from unittest.mock import MagicMock, call, patch + +import yaml + +from scribe_data.cli.total.print_values import get_datatype_list +from scribe_data.cli.total.query import get_qid_by_input, query_total_lexemes +from scribe_data.utils import WIKIDATA_QIDS_PIDS_FILE, check_qid_is_language + +try: + with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file: + wikidata_qids_pids = yaml.safe_load(file) + +except (IOError, yaml.YAMLError) as e: + print(f"Error reading wikidata QIDs/PIDs metadata: {e}") + +# MARK: Query + + +class TestCLITotalQuery(unittest.TestCase): + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + def test_cli_total_query_valid( + self, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + mock_get_qid.side_effect = lambda x: {"english": "Q1860", "nouns": "Q1084"}.get( + x.lower() + ) + mock_results = MagicMock() + mock_results.convert.return_value = { + "results": {"bindings": [{"total": {"value": "42"}}]} + } + mock_query.return_value = mock_results + + with patch("builtins.print") as mock_print: + query_total_lexemes(language="English", data_type="nouns") + + mock_print.assert_called_once_with( + "\nLanguage: English\nData type: nouns\nTotal number of lexemes: 42\n" + ) + + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + def test_cli_total_query_no_results( + self, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + mock_get_qid.side_effect = lambda x: {"english": "Q1860", "nouns": "Q1084"}.get( + x.lower() + ) + mock_results = MagicMock() + mock_results.convert.return_value = {"results": {"bindings": []}} + mock_query.return_value = mock_results + + with patch("builtins.print") as mock_print: + query_total_lexemes(language="English", data_type="nouns") + + mock_print.assert_called_once_with("Total number of lexemes: Not found") + + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + def test_cli_total_query_invalid_language( + self, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + mock_get_qid.side_effect = lambda x: None + mock_query.return_value = MagicMock() + + with patch("builtins.print") as mock_print: + query_total_lexemes(language="InvalidLanguage", data_type="nouns") + + mock_print.assert_called_once_with("Total number of lexemes: Not found") + + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + def test_cli_total_query_empty_and_none_inputs( + self, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + mock_get_qid.return_value = None + mock_query.return_value = MagicMock() + + # Call the function with empty and None inputs. + with patch("builtins.print") as mock_print: + query_total_lexemes(language="", data_type="nouns") + query_total_lexemes(language=None, data_type="verbs") + + expected_calls = [ + call("Total number of lexemes: Not found"), + call("Total number of lexemes: Not found"), + ] + mock_print.assert_has_calls(expected_calls, any_order=True) + + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + def test_cli_total_query_nonexistent_language( + self, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + mock_get_qid.return_value = None + mock_query.return_value = MagicMock() + + with patch("builtins.print") as mock_print: + query_total_lexemes(language="Martian", data_type="nouns") + + mock_print.assert_called_once_with("Total number of lexemes: Not found") + + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + def test_cli_total_query_various_data_types( + self, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + mock_get_qid.side_effect = lambda x: { + "english": "Q1860", + "verbs": "Q24905", + "nouns": "Q1084", + }.get(x.lower()) + mock_results = MagicMock() + mock_results.convert.return_value = { + "results": {"bindings": [{"total": {"value": "30"}}]} + } + + mock_query.return_value = mock_results + + # Call the function with different data types. + with patch("builtins.print") as mock_print: + query_total_lexemes(language="English", data_type="verbs") + query_total_lexemes(language="English", data_type="nouns") + + expected_calls = [ + call( + "\nLanguage: English\nData type: verbs\nTotal number of lexemes: 30\n" + ), + call( + "\nLanguage: English\nData type: nouns\nTotal number of lexemes: 30\n" + ), + ] + mock_print.assert_has_calls(expected_calls) + + @patch("scribe_data.cli.total.query.get_qid_by_input") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") + @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") + def test_cli_total_query_sub_languages( + self, mock_dir: MagicMock, mock_query: MagicMock, mock_get_qid: MagicMock + ) -> None: + # Setup for sub-languages. + mock_get_qid.side_effect = lambda x: { + "bokmål": "Q25167", + "nynorsk": "Q25164", + }.get(x.lower()) + mock_results = MagicMock() + mock_results.convert.return_value = { + "results": {"bindings": [{"total": {"value": "30"}}]} + } + mock_query.return_value = mock_results + + # Mocking directory paths and contents. + mock_dir.__truediv__.return_value.exists.return_value = True + mock_dir.__truediv__.return_value.iterdir.return_value = [ + MagicMock(name="verbs", is_dir=lambda: True), + MagicMock(name="nouns", is_dir=lambda: True), + ] + + with patch("builtins.print") as mock_print: + query_total_lexemes(language="Norwegian", data_type="verbs") + query_total_lexemes(language="Norwegian", data_type="nouns") + + expected_calls = [ + call( + "\nLanguage: Norwegian\nData type: verbs\nTotal number of lexemes: 30\n" + ), + call( + "\nLanguage: Norwegian\nData type: nouns\nTotal number of lexemes: 30\n" + ), + ] + mock_print.assert_has_calls(expected_calls) + + +class TestGetQidByInput(unittest.TestCase): + def setUp(self) -> None: + self.valid_data_types = { + "english": "Q1860", + "nouns": "Q1084", + "verbs": "Q24905", + } + + @patch("scribe_data.utils.data_type_metadata", new_callable=dict) + def test_get_qid_by_input_valid(self, mock_data_type_metadata: MagicMock) -> None: + mock_data_type_metadata.update(self.valid_data_types) + + for data_type, expected_qid in self.valid_data_types.items(): + self.assertEqual(get_qid_by_input(data_type), expected_qid) + + @patch("scribe_data.utils.data_type_metadata", new_callable=dict) + def test_get_qid_by_input_invalid(self, mock_data_type_metadata: MagicMock) -> None: + mock_data_type_metadata.update(self.valid_data_types) + + self.assertIsNone(get_qid_by_input("invalid_data_type")) + + +class TestGetDatatypeList(unittest.TestCase): + @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") + def test_get_datatype_list_invalid_language(self, mock_dir: MagicMock) -> None: + mock_dir.__truediv__.return_value.exists.return_value = False + + with self.assertRaises(ValueError): + get_datatype_list("InvalidLanguage") + + @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") + def test_get_datatype_list_no_data_types(self, mock_dir: MagicMock) -> None: + mock_dir.__truediv__.return_value.exists.return_value = True + mock_dir.__truediv__.return_value.iterdir.return_value = [] + + with self.assertRaises(ValueError): + get_datatype_list("English") + + +class TestCheckQidIsLanguage(unittest.TestCase): + @patch("scribe_data.utils.requests.get") + def test_check_qid_is_language_valid(self, mock_get: MagicMock) -> None: + mock_response = MagicMock() + mock_response.json.return_value = { + "statements": { + wikidata_qids_pids["instance_of"]: [{"value": {"content": "Q34770"}}] + }, + "labels": {"en": "English"}, + } + mock_get.return_value = mock_response + + with patch("builtins.print") as mock_print: + result = check_qid_is_language("Q1860") + + self.assertEqual(result, "English") + mock_print.assert_called_once_with("English (Q1860) is a language.\n") + + @patch("scribe_data.utils.requests.get") + def test_check_qid_is_language_invalid(self, mock_get: MagicMock) -> None: + mock_response = MagicMock() + mock_response.json.return_value = { + "statements": { + wikidata_qids_pids["instance_of"]: [{"value": {"content": "Q5"}}] + }, + "labels": {"en": "Human"}, + } + mock_get.return_value = mock_response + + with self.assertRaises(ValueError): + check_qid_is_language("Q5") diff --git a/tests/cli/total/test_cli_total_wrapper.py b/tests/cli/total/test_cli_total_wrapper.py new file mode 100644 index 000000000..fa0595cbf --- /dev/null +++ b/tests/cli/total/test_cli_total_wrapper.py @@ -0,0 +1,384 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Tests for the CLI total wrapper functionality. +""" + +import unittest +from http.client import IncompleteRead +from pathlib import Path +from unittest.mock import MagicMock, call, patch +from urllib.error import HTTPError + +import yaml + +from scribe_data.cli.total.print_values import get_datatype_list +from scribe_data.cli.total.query import query_total_lexemes +from scribe_data.cli.total.wrapper import total_wrapper +from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, WIKIDATA_QIDS_PIDS_FILE + +try: + with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file: + wikidata_qids_pids = yaml.safe_load(file) + +except (IOError, yaml.YAMLError) as e: + print(f"Error reading wikidata QIDs/PIDs metadata: {e}") + +# MARK: Wrapper + + +class TestCLITotalWrapper(unittest.TestCase): + @patch("scribe_data.cli.total.print_total_lexemes") + def test_cli_total_wrapper_all_bool( + self, mock_print_total_lexemes: MagicMock + ) -> None: + total_wrapper(all_bool=True) + mock_print_total_lexemes.assert_called_once_with() + + @patch("scribe_data.cli.total.print_total_lexemes") + def test_cli_total_wrapper_language_only( + self, mock_print_total_lexemes: MagicMock + ) -> None: + total_wrapper(languages=["English"]) + mock_print_total_lexemes.assert_called_once_with(language="English") + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_language_and_data_type( + self, mock_query_total_lexemes_lexemes: MagicMock + ) -> None: + total_wrapper(languages=["English"], data_types=["nouns"]) + mock_query_total_lexemes_lexemes.assert_called_once_with( + language="English", data_type="nouns" + ) + + def test_cli_total_wrapper_invalid_input(self) -> None: + with self.assertRaises(ValueError): + total_wrapper() + + # MARK: Using Dump + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_cli_total_wrapper_wikidata_dump_flag( + self, mock_parse_dump: MagicMock + ) -> None: + """ + Test when wikidata_dump is True (flag without path). + """ + total_wrapper(wikidata_dump=True) + mock_parse_dump.assert_called_once_with( + languages=["all"], + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_cli_total_wrapper_wikidata_dump_with_all( + self, mock_parse_dump: MagicMock + ) -> None: + """ + Test when both wikidata_dump and all_bool are True. + """ + total_wrapper(wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + languages=["all"], + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_cli_total_wrapper_wikidata_dump_with_language_and_type( + self, mock_parse_dump: MagicMock + ) -> None: + """ + Test wikidata_dump with specific language and data type. + """ + total_wrapper( + languages=["English"], + data_types=["nouns"], + wikidata_dump=Path("/path/to/dump.json"), + ) + mock_parse_dump.assert_called_once_with( + languages=["English"], + data_types=["nouns"], + wikidata_dump_type=["total"], + wikidata_dump_path=Path("/path/to/dump.json"), + ) + + # MARK: Using QID + + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.print_total_lexemes") + def test_cli_total_wrapper_with_qid( + self, mock_print_total: MagicMock, mock_check_qid: MagicMock + ) -> None: + """ + Test when language is provided as a QID. + """ + mock_check_qid.return_value = "Thai" + total_wrapper(languages=["Q9217"]) + mock_print_total.assert_called_once_with(language="Q9217") + + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_with_qid_and_datatype( + self, mock_query_total_lexemes: MagicMock, mock_check_qid: MagicMock + ) -> None: + """ + Test when language QID and data type are provided. + """ + mock_check_qid.return_value = "Thai" + total_wrapper(languages=["Q9217"], data_types=["nouns"]) + mock_query_total_lexemes.assert_called_once_with( + language="Q9217", data_type="nouns" + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_cli_total_wrapper_qid_with_wikidata_dump( + self, mock_parse_dump: MagicMock + ) -> None: + """ + Test QID with wikidata dump. + """ + total_wrapper(languages=["Q9217"], wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + languages=["Q9217"], + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + ) + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_query_total_lexemes_with_qid( + self, mock_query_total_lexemes: MagicMock + ) -> None: + """ + Test query_total_lexemes with QID input. + """ + total_wrapper(languages=["Q9217"], data_types=["Q1084"]) # Q1084 is noun QID + mock_query_total_lexemes.assert_called_once_with( + language="Q9217", data_type="Q1084" + ) + + # MARK: Multiple Languages and Data Types + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_multiple_languages( + self, mock_query_total_lexemes: MagicMock + ) -> None: + """ + Test retrieving totals for multiple languages. + """ + # Mock return value to avoid formatting error. + mock_query_total_lexemes.return_value = 100 + + total_wrapper(languages=["English", "German"], data_types=["nouns"]) + + expected_calls = [ + call(language="English", data_type="nouns", do_print=False), + call(language="German", data_type="nouns", do_print=False), + ] + mock_query_total_lexemes.assert_has_calls(expected_calls) + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_multiple_data_types( + self, mock_query_total_lexemes: MagicMock + ) -> None: + """ + Test retrieving totals for multiple data types. + """ + # Mock return value to avoid formatting error. + mock_query_total_lexemes.return_value = 100 + + total_wrapper(languages=["English"], data_types=["nouns", "verbs"]) + + expected_calls = [ + call(language="English", data_type="nouns", do_print=False), + call(language="English", data_type="verbs", do_print=False), + ] + mock_query_total_lexemes.assert_has_calls(expected_calls) + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_total_wrapper_multiple_languages_and_types( + self, mock_query_total_lexemes: MagicMock + ) -> None: + """ + Test retrieving totals for multiple languages and data types. + """ + # Mock return value to avoid formatting error. + mock_query_total_lexemes.return_value = 100 + + total_wrapper(languages=["English", "German"], data_types=["nouns", "verbs"]) + + expected_calls = [ + call(language="English", data_type="nouns", do_print=False), + call(language="English", data_type="verbs", do_print=False), + call(language="German", data_type="nouns", do_print=False), + call(language="German", data_type="verbs", do_print=False), + ] + mock_query_total_lexemes.assert_has_calls(expected_calls) + + # MARK: Error Handling + + @patch("scribe_data.cli.total.sparql.query") + def test_cli_query_total_lexemes_http_error(self, mock_query: MagicMock) -> None: + """ + Test handling of HTTPError when querying totals. + """ + # Set up mock to return None for results after max retries. + mock_query.side_effect = [ + HTTPError(url="test", code=500, msg="error", hdrs={}, fp=None), + HTTPError(url="test", code=500, msg="error", hdrs={}, fp=None), + HTTPError(url="test", code=500, msg="error", hdrs={}, fp=None), + ] + + with patch("builtins.print") as mock_print: + result = query_total_lexemes(language="English", data_type="nouns") + + self.assertIsNone(result) + mock_print.assert_any_call("Query failed after retries.") + + @patch("scribe_data.cli.total.sparql.query") + def test_cli_query_total_lexemes_incomplete_read( + self, mock_query: MagicMock + ) -> None: + """ + Test handling of IncompleteRead error when querying totals. + """ + # Set up mock to return None for results after max retries. + mock_query.side_effect = [ + IncompleteRead(partial=b""), + IncompleteRead(partial=b""), + IncompleteRead(partial=b""), + ] + + with patch("builtins.print") as mock_print: + result = query_total_lexemes(language="English", data_type="nouns") + + self.assertIsNone(result) + mock_print.assert_any_call("Query failed after retries.") + + # MARK: Sub-language Handling + + @patch("scribe_data.cli.total.get_datatype_list") + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_print_total_lexemes_with_sublanguages( + self, mock_query_total_lexemes: MagicMock, mock_get_datatypes: MagicMock + ) -> None: + """ + Test printing totals for a language with sub-languages. + """ + mock_get_datatypes.return_value = ["nouns", "verbs"] + mock_query_total_lexemes.return_value = 100 + + with patch("builtins.print") as mock_print: + total_wrapper(languages=["Norwegian"], data_types=["nouns", "verbs"]) + + # Verify header was printed. + mock_print.assert_any_call( + f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + ) + mock_print.assert_any_call("=" * 70) + + # Verify data was printed for each data type. + mock_query_total_lexemes.assert_any_call( + language="Norwegian", data_type="nouns", do_print=False + ) + mock_query_total_lexemes.assert_any_call( + language="Norwegian", data_type="verbs", do_print=False + ) + + # MARK: Data Type List Handling + + @patch("scribe_data.cli.total.language_metadata") + @patch("scribe_data.cli.total.list_all_languages") + @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") + def test_cli_get_datatype_list_with_sublanguages( + self, + mock_dir: MagicMock, + mock_list_languages: MagicMock, + mock_metadata: MagicMock, + ) -> None: + """ + Test getting data type list for a language with sub-languages. + """ + # Mock language metadata and list_all_languages. + mock_metadata_dict = { + "norwegian": { + "sub_languages": {"bokmal": {"iso": "nb"}, "nynorsk": {"iso": "nn"}} + } + } + + # Mock dictionary-like behavior for language_metadata. + mock_metadata.__iter__.return_value = mock_metadata_dict.items() + mock_metadata.items.return_value = mock_metadata_dict.items() + mock_metadata.get.return_value = mock_metadata_dict["norwegian"] + mock_metadata.__getitem__.return_value = mock_metadata_dict["norwegian"] + + mock_list_languages.return_value = ["norwegian"] + + # Create mock directory entries with proper string names. + mock_nouns = MagicMock() + mock_nouns.name = "nouns" + mock_nouns.is_dir.return_value = True + + mock_verbs = MagicMock() + mock_verbs.name = "verbs" + mock_verbs.is_dir.return_value = True + + # Mock directory structure for both sub-languages. + def mock_path_handler(path: str) -> MagicMock: + mock_path = MagicMock() + mock_path.exists.return_value = True + mock_path.iterdir.return_value = [mock_nouns, mock_verbs] + return mock_path + + mock_dir.__truediv__.side_effect = mock_path_handler + + result = get_datatype_list("norwegian") # note: lowercase + self.assertEqual(sorted(result), ["nouns", "verbs"]) + + @patch("scribe_data.cli.total.language_metadata") + @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") + def test_cli_get_datatype_list_empty_directory( + self, mock_dir: MagicMock, mock_metadata: MagicMock + ) -> None: + """ + Test getting data type list from an empty directory. + """ + # Mock language metadata. + mock_metadata.get.return_value = {} + + mock_dir.__truediv__.return_value.exists.return_value = True + mock_dir.__truediv__.return_value.iterdir.return_value = [] + + with self.assertRaises(ValueError): + get_datatype_list("English") + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_with_invalid_language( + self, mock_query_total_lexemes: MagicMock + ) -> None: + """ + Test total wrapper with invalid language. + """ + mock_query_total_lexemes.side_effect = ValueError("Invalid language") + + with self.assertRaises(ValueError): + total_wrapper(languages=["invalid_lang"], data_types=["nouns"]) + + mock_query_total_lexemes.assert_called_once() + + @patch("scribe_data.cli.total.query_total_lexemes") + def test_cli_total_wrapper_with_invalid_data_type( + self, mock_query_total_lexemes: MagicMock + ) -> None: + """ + Test total wrapper with invalid data type. + """ + mock_query_total_lexemes.side_effect = ValueError("Invalid data type") + + with self.assertRaises(ValueError): + total_wrapper(languages=["English"], data_types=["invalid_type"]) + + mock_query_total_lexemes.assert_called_once() diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py deleted file mode 100644 index 2d107ad7a..000000000 --- a/tests/load/test_update_utils.py +++ /dev/null @@ -1,177 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -""" -Tests for the update_utils file functions. -""" - -import sys -from pathlib import Path - -import pytest - -sys.path.append(Path(__file__).parent.parent.parent) - -from scribe_data import utils - - -@pytest.mark.parametrize( - "language, qid_code", - [ - ("english", "Q1860"), - ("french", "Q150"), - ("german", "Q188"), - ("italian", "Q652"), - ("portuguese", "Q5146"), - ("russian", "Q7737"), - ("spanish", "Q1321"), - ("swedish", "Q9027"), - ("bokmål", "Q25167"), - ], -) -def test_get_language_qid_positive(language: str, qid_code: str) -> None: - assert utils.get_language_qid(language) == qid_code - - -def test_get_language_qid_negative() -> None: - with pytest.raises(ValueError) as excp: - _ = utils.get_language_qid("Newspeak") - - assert ( - str(excp.value) - == "Newspeak is currently not a supported language for QID conversion." - ) - - -@pytest.mark.parametrize( - "language, iso_code", - [ - ("english", "en"), - ("french", "fr"), - ("german", "de"), - ("italian", "it"), - ("portuguese", "pt"), - ("russian", "ru"), - ("spanish", "es"), - ("swedish", "sv"), - ("bokmål", "nb"), - ], -) -def test_get_language_iso_positive(language: str, iso_code: str) -> None: - assert utils.get_language_iso(language) == iso_code - - -def test_get_language_iso_negative() -> None: - with pytest.raises(ValueError) as excp: - _ = utils.get_language_iso("Gibberish") - - assert ( - str(excp.value) - == "Gibberish is currently not a supported language for ISO conversion." - ) - - -@pytest.mark.parametrize( - "iso_code, language", - [ - ("en", "English"), - ("fr", "French"), - ("de", "German"), - ("it", "Italian"), - ("pt", "Portuguese"), - ("ru", "Russian"), - ("es", "Spanish"), - ("sv", "Swedish"), - ("nb", "Bokmål"), - ], -) -def test_get_language_from_iso_positive(iso_code: str, language: str) -> None: - assert utils.get_language_from_iso(iso_code) == language - - -def test_get_language_from_iso_negative() -> None: - with pytest.raises(ValueError) as excp: - _ = utils.get_language_from_iso("ixi") - - assert str(excp.value) == "IXI is currently not a supported ISO language." - - -@pytest.mark.parametrize( - "lang, expected_output", - [ - ("nynorsk", "nynorsk norwegian"), - ("bokmål", "bokmål norwegian"), - ("english", "english"), - ], -) -def test_format_sublanguage_name_positive(lang: str, expected_output: str) -> None: - assert utils.format_sublanguage_name(lang) == expected_output - - -@pytest.mark.parametrize( - "lang, expected_output", - [ - ("Q42", "Q42"), # test that any QID is returned - ("Q1860", "Q1860"), - ], -) -def test_format_sublanguage_name_qid_positive(lang: str, expected_output: str) -> None: - assert utils.format_sublanguage_name(lang) == expected_output - - -def test_format_sublanguage_name_negative() -> None: - with pytest.raises(ValueError) as excp: - _ = utils.format_sublanguage_name("Newspeak") - - assert str(excp.value) == "Newspeak is not a valid language or sub-language." - - -def test_list_all_languages() -> None: - expected_languages = [ - "arabic", - "basque", - "bengali", - "bokmål norwegian", - "czech", - "dagbani", - "danish", - "english", - "esperanto", - "estonian", - "finnish", - "french", - "german", - "greek", - "gurmukhi punjabi", - "hausa", - "hebrew", - "hindi hindustani", - "igbo", - "indonesian", - "italian", - "japanese", - "korean", - "kurmanji", - "latin", - "latvian", - "malay", - "malayalam", - "mandarin chinese", - "nigerian pidgin", - "northern sami", - "nynorsk norwegian", - "persian", - "polish", - "portuguese", - "russian", - "shahmukhi punjabi", - "slovak", - "spanish", - "swahili", - "swedish", - "tajik", - "tamil", - "ukrainian", - "urdu hindustani", - "yoruba", - ] - - assert utils.list_all_languages() == expected_languages diff --git a/tests/resources/test_metadata.py b/tests/resources/test_resources_metadata.py similarity index 84% rename from tests/resources/test_metadata.py rename to tests/resources/test_resources_metadata.py index 70f490c7d..e23f8a496 100644 --- a/tests/resources/test_metadata.py +++ b/tests/resources/test_resources_metadata.py @@ -15,7 +15,7 @@ ) -class TestFileAccessibility(TestCase): +class TestResourcesMetadataFileAccessibility(TestCase): def check_file_exists(self, file_path: pathlib.Path) -> None: """ Helper method to check if a file exists. @@ -39,25 +39,25 @@ def check_file_readable(self, file_path: pathlib.Path) -> None: # Catching any other file reading error self.fail(f"Failed to read {file_path}: {str(e)}") - def test_language_metadata_file_exists(self) -> None: + def test_resources_language_metadata_file_exists(self) -> None: """ Check if the language_metadata.yaml file exists. """ self.check_file_exists(LANGUAGE_METADATA_PATH) - def test_language_metadata_file_readable(self) -> None: + def test_resources_language_metadata_file_readable(self) -> None: """ Check if the language_metadata.yaml file is readable. """ self.check_file_readable(LANGUAGE_METADATA_PATH) - def test_data_type_metadata_file_exists(self) -> None: + def test_resources_data_type_metadata_file_exists(self) -> None: """ Check if the data_type_metadata.yaml file exists. """ self.check_file_exists(DATA_TYPE_METADATA_PATH) - def test_data_type_metadata_file_readable(self) -> None: + def test_resources_data_type_metadata_file_readable(self) -> None: """ Check if the data_type_metadata.yaml file is readable. """ diff --git a/tests/unicode/test_generate_emoji_keywords.py b/tests/unicode/test_unicode_generate_emoji_keywords.py similarity index 94% rename from tests/unicode/test_generate_emoji_keywords.py rename to tests/unicode/test_unicode_generate_emoji_keywords.py index 858f89848..980542e5c 100644 --- a/tests/unicode/test_generate_emoji_keywords.py +++ b/tests/unicode/test_unicode_generate_emoji_keywords.py @@ -46,7 +46,7 @@ def mock_process_unicode() -> Iterator[MagicMock]: yield mock_lexicon -def test_generate_emoji_success( +def test_unicode_generate_emoji_success( mock_pyicu: tuple[MagicMock, MagicMock], mock_utils: tuple[MagicMock, MagicMock], mock_process_unicode: MagicMock, @@ -70,7 +70,7 @@ def test_generate_emoji_success( mock_export.assert_called_once() -def test_generate_emoji_pyicu_not_installed( +def test_unicode_generate_emoji_pyicu_not_installed( mock_pyicu: tuple[MagicMock, MagicMock], ) -> None: mock_check_install, mock_check_installed = mock_pyicu @@ -83,7 +83,7 @@ def test_generate_emoji_pyicu_not_installed( mock_check_installed.assert_called_once() -def test_generate_emoji_unsupported_language( +def test_unicode_generate_emoji_unsupported_language( mock_pyicu: tuple[MagicMock, MagicMock], mock_utils: tuple[MagicMock, MagicMock], tmp_path: Path, @@ -102,7 +102,7 @@ def test_generate_emoji_unsupported_language( mock_iso.assert_called_once_with(language="xx") -def test_generate_emoji_output_dir_handling( +def test_unicode_generate_emoji_output_dir_handling( mock_pyicu: tuple[MagicMock, MagicMock], mock_utils: tuple[MagicMock, MagicMock], mock_process_unicode: MagicMock, diff --git a/tests/wikidata/test_check_query.py b/tests/wikidata/test_wikidata_check_query.py similarity index 83% rename from tests/wikidata/test_check_query.py rename to tests/wikidata/test_wikidata_check_query.py index 882f844d8..5abe91191 100755 --- a/tests/wikidata/test_check_query.py +++ b/tests/wikidata/test_wikidata_check_query.py @@ -46,42 +46,42 @@ def a_query() -> QueryFile: # MARK: Query -def test_full_path(a_query: QueryFile) -> None: +def test_wikidata_full_path(a_query: QueryFile) -> None: assert a_query.path == A_PATH @patch("builtins.open", new_callable=mock_open, read_data="QUERY") -def test_query_load(_: MagicMock, a_query: QueryFile) -> None: +def test_wikidata_query_load(_: MagicMock, a_query: QueryFile) -> None: assert a_query.load(12) == "QUERY\nLIMIT 12\n" -def test_query_equals(a_query: QueryFile) -> None: +def test_wikidata_query_equals(a_query: QueryFile) -> None: assert a_query == QueryFile(A_PATH) -def test_query_not_equals(a_query: QueryFile) -> None: +def test_wikidata_query_not_equals(a_query: QueryFile) -> None: assert a_query != QueryFile(normalize_path("/root/project/src/Dir/query.sparql")) -def test_query_not_equals_object(a_query: QueryFile) -> None: +def test_wikidata_query_not_equals_object(a_query: QueryFile) -> None: assert a_query != object() -def test_query_str(a_query: QueryFile) -> None: +def test_wikidata_query_str(a_query: QueryFile) -> None: assert ( str(a_query) == f"QueryFile(path={normalize_path('/root/project/src/dir/query.sparql')})" ) -def test_query_repr(a_query: QueryFile) -> None: +def test_wikidata_query_repr(a_query: QueryFile) -> None: assert ( repr(a_query) == f"QueryFile(path={normalize_path('/root/project/src/dir/query.sparql')})" ) -def test_query_execution_exception(a_query: QueryFile) -> None: +def test_wikidata_query_execution_exception(a_query: QueryFile) -> None: exception = QueryExecutionException("failure", a_query) assert str(exception) == f"{S_PATH} : failure" @@ -90,7 +90,7 @@ def test_query_execution_exception(a_query: QueryFile) -> None: @patch("urllib.request.urlopen") -def test_ping_pass(mock_urlopen: MagicMock) -> None: +def test_wikidata_ping_pass(mock_urlopen: MagicMock) -> None: mock_urlopen.return_value.__enter__.return_value.getcode.return_value = ( HTTPStatus.OK ) @@ -98,19 +98,19 @@ def test_ping_pass(mock_urlopen: MagicMock) -> None: @patch("urllib.request.urlopen") -def test_ping_httperror_fail(mock_urlopen: MagicMock) -> None: +def test_wikidata_ping_httperror_fail(mock_urlopen: MagicMock) -> None: mock_urlopen.return_value.__enter__.side_effect = HTTPError assert not ping("http://www.python.org", 0) @patch("urllib.request.urlopen") -def test_ping_exception_fail(mock_urlopen: MagicMock) -> None: +def test_wikidata_ping_exception_fail(mock_urlopen: MagicMock) -> None: mock_urlopen.return_value.__enter__.side_effect = Exception assert not ping("http://www.python.org", 0) @patch("urllib.request.urlopen") -def test_ping_fail(mock_urlopen: MagicMock) -> None: +def test_wikidata_ping_fail(mock_urlopen: MagicMock) -> None: mock_urlopen.return_value.__enter__.return_value.getcode.return_value = ( HTTPStatus.BAD_REQUEST ) @@ -121,12 +121,12 @@ def test_ping_fail(mock_urlopen: MagicMock) -> None: @patch.object(Path, "is_file", return_value=True) -def test_check_sparql_file_exists(_: MagicMock) -> None: +def test_wikidata_check_sparql_file_exists(_: MagicMock) -> None: assert check_sparql_file(S_PATH) == A_PATH @patch.object(Path, "is_file", return_value=False) -def test_check_sparql_file_not_exists(_: MagicMock) -> None: +def test_wikidata_check_sparql_file_not_exists(_: MagicMock) -> None: with pytest.raises(argparse.ArgumentTypeError) as err: _ = check_sparql_file(S_PATH) @@ -134,7 +134,7 @@ def test_check_sparql_file_not_exists(_: MagicMock) -> None: @patch.object(Path, "is_file", return_value=True) -def test_check_sparql_file_not_sparql_extension(_: MagicMock) -> None: +def test_wikidata_check_sparql_file_not_sparql_extension(_: MagicMock) -> None: fpath = Path("/root/query.txt") with pytest.raises(argparse.ArgumentTypeError) as err: _ = check_sparql_file(fpath) @@ -162,7 +162,7 @@ def test_check_sparql_file_not_sparql_extension(_: MagicMock) -> None: ], ) @patch("subprocess.run") -def test_changed_queries( +def test_wikidata_changed_queries( mock_run: MagicMock, git_status: str, expected: list[Any] ) -> None: mock_result = MagicMock() @@ -173,7 +173,7 @@ def test_changed_queries( @patch("subprocess.run") -def test_changed_queries_failure( +def test_wikidata_changed_queries_failure( mock_run: MagicMock, capsys: pytest.CaptureFixture ) -> None: mock_result = MagicMock() @@ -208,7 +208,7 @@ def test_changed_queries_failure( ), ], ) -def test_all_queries(tree: list[Any], expected: list[Any]) -> None: +def test_wikidata_all_queries(tree: list[Any], expected: list[Any]) -> None: with patch("os.walk") as mock_walk: mock_walk.return_value = tree @@ -216,7 +216,7 @@ def test_all_queries(tree: list[Any], expected: list[Any]) -> None: # MARK: execute -def test_execute(a_query: QueryFile) -> None: +def test_wikidata_execute(a_query: QueryFile) -> None: with pytest.raises(QueryExecutionException) as err: _ = execute(a_query, 1, None, 0) @@ -232,7 +232,7 @@ def test_execute(a_query: QueryFile) -> None: ("1000", 1000), ], ) -def test_check_limit_pos(candidate: str, limit: int) -> None: +def test_wikidata_check_limit_pos(candidate: str, limit: int) -> None: assert check_limit(candidate) == limit @@ -245,7 +245,7 @@ def test_check_limit_pos(candidate: str, limit: int) -> None: "word", ], ) -def test_check_limit_neg(candidate: str) -> None: +def test_wikidata_check_limit_neg(candidate: str) -> None: with pytest.raises(argparse.ArgumentTypeError) as err: _ = check_limit(candidate) @@ -263,7 +263,7 @@ def test_check_limit_neg(candidate: str) -> None: ("8888", 8888), ], ) -def test_check_timeout_pos(candidate: str, timeout: int) -> None: +def test_wikidata_check_timeout_pos(candidate: str, timeout: int) -> None: assert check_timeout(candidate) == timeout @@ -276,7 +276,7 @@ def test_check_timeout_pos(candidate: str, timeout: int) -> None: "ten", ], ) -def test_check_timeout_neg(candidate: str) -> None: +def test_wikidata_check_timeout_neg(candidate: str) -> None: with pytest.raises(argparse.ArgumentTypeError) as err: _ = check_timeout(candidate) @@ -287,7 +287,7 @@ def test_check_timeout_neg(candidate: str) -> None: @pytest.mark.parametrize("arg", ["-h", "--help"]) -def test_main_help(arg: str) -> None: +def test_wikidata_main_help(arg: str) -> None: with pytest.raises(SystemExit) as err: _ = main(arg) assert err.code == 0 @@ -304,7 +304,7 @@ def test_main_help(arg: str) -> None: ["-c", "-f", "-a"], ], ) -def test_main_mutex_opts(args: list[str]) -> None: +def test_wikidata_main_mutex_opts(args: list[str]) -> None: """ Some options cannot be used together. """ @@ -313,7 +313,9 @@ def test_main_mutex_opts(args: list[str]) -> None: assert err.code == 2 -def test_error_report_single(a_query: QueryFile, capsys: pytest.CaptureFixture) -> None: +def test_wikidata_error_report_single( + a_query: QueryFile, capsys: pytest.CaptureFixture +) -> None: failures = [QueryExecutionException("timeout", a_query)] error_report(failures) err_out = capsys.readouterr().err @@ -323,7 +325,7 @@ def test_error_report_single(a_query: QueryFile, capsys: pytest.CaptureFixture) ) -def test_error_report_multiple( +def test_wikidata_error_report_multiple( a_query: QueryFile, capsys: pytest.CaptureFixture ) -> None: failures = [ @@ -339,12 +341,12 @@ def test_error_report_multiple( ) -def test_error_report_no_errors(capsys: pytest.CaptureFixture) -> None: +def test_wikidata_error_report_no_errors(capsys: pytest.CaptureFixture) -> None: error_report([]) assert capsys.readouterr().err == "" -def test_success_report_single_display_set( +def test_wikidata_success_report_single_display_set( a_query: QueryFile, capsys: pytest.CaptureFixture ) -> None: successes = [(a_query, {"a": 23})] @@ -356,7 +358,9 @@ def test_success_report_single_display_set( ) -def test_success_report_no_success_display_set(capsys: pytest.CaptureFixture) -> None: +def test_wikidata_success_report_no_success_display_set( + capsys: pytest.CaptureFixture, +) -> None: success_report([], display=True) assert capsys.readouterr().out == "" @@ -365,7 +369,7 @@ def test_success_report_no_success_display_set(capsys: pytest.CaptureFixture) -> "successes", [[], [(a_query, {"a": 23})], [(a_query, {"a": 23}), (a_query, {"b": 53})]], ) -def test_success_report_display_not_set( +def test_wikidata_success_report_display_not_set( successes: list[Any], capsys: pytest.CaptureFixture ) -> None: success_report(successes, display=False) @@ -373,7 +377,7 @@ def test_success_report_display_not_set( assert out == "" -def test_success_report_multiple_display_set( +def test_wikidata_success_report_multiple_display_set( a_query: QueryFile, capsys: pytest.CaptureFixture ) -> None: successes = [(a_query, {"a": 23}), (a_query, {"b": 57})] @@ -389,14 +393,14 @@ def test_success_report_multiple_display_set( # MARK: check_query_forms -def test_qid_label_dict_not_empty() -> None: +def test_wikidata_qid_label_dict_not_empty() -> None: assert check_query_forms.qid_label_dict, "qid_label_dict should not be empty" # MARK: extract_forms_from_sparql -def test_extract_forms_from_sparql_valid_file(tmp_path: Path) -> None: +def test_wikidata_extract_forms_from_sparql_valid_file(tmp_path: Path) -> None: sparql_file = tmp_path / "test.sparql" # The pattern r"\s\sOPTIONAL\s*\{([^}]*)\}" requires exactly two spaces before OPTIONAL. sparql_file.write_text(" OPTIONAL { form1 } OPTIONAL { form2 }") @@ -404,7 +408,7 @@ def test_extract_forms_from_sparql_valid_file(tmp_path: Path) -> None: assert result == [" form1 ", " form2 "] -def test_extract_forms_from_sparql_no_matches(tmp_path: Path) -> None: +def test_wikidata_extract_forms_from_sparql_no_matches(tmp_path: Path) -> None: sparql_file = tmp_path / "test.sparql" sparql_file.write_text("SELECT * WHERE { }") result = check_query_forms.extract_forms_from_sparql(sparql_file) @@ -412,7 +416,7 @@ def test_extract_forms_from_sparql_no_matches(tmp_path: Path) -> None: @patch("builtins.open", side_effect=Exception("File error")) -def test_extract_forms_from_sparql_exception( +def test_wikidata_extract_forms_from_sparql_exception( mock_open: MagicMock, capsys: pytest.CaptureFixture ) -> None: result = check_query_forms.extract_forms_from_sparql(Path("nonexistent.sparql")) @@ -424,13 +428,13 @@ def test_extract_forms_from_sparql_exception( # MARK: extract_form_rep_label -def test_extract_form_rep_label_valid() -> None: +def test_wikidata_extract_form_rep_label_valid() -> None: form_text = "ontolex:representation ?testLabel ;" result = check_query_forms.extract_form_rep_label(form_text) assert result == "testLabel" -def test_extract_form_rep_label_no_match() -> None: +def test_wikidata_extract_form_rep_label_no_match() -> None: form_text = "invalid text" result = check_query_forms.extract_form_rep_label(form_text) assert result is None @@ -439,7 +443,7 @@ def test_extract_form_rep_label_no_match() -> None: # MARK: decompose_label_features -def test_decompose_label_features_valid() -> None: +def test_wikidata_decompose_label_features_valid() -> None: label = "nominativeSingular" with patch.object( check_query_forms, "lexeme_form_labels_order", ["Nominative", "Singular"] @@ -448,7 +452,7 @@ def test_decompose_label_features_valid() -> None: assert result == ["Nominative", "Singular"] -def test_decompose_label_features_invalid() -> None: +def test_wikidata_decompose_label_features_invalid() -> None: label = "unknownFeature" with patch.object( check_query_forms, "lexeme_form_labels_order", ["Nominative", "Singular"] @@ -457,7 +461,7 @@ def test_decompose_label_features_invalid() -> None: assert result == ["UnknownFeature"] -def test_decompose_label_features_empty() -> None: +def test_wikidata_decompose_label_features_empty() -> None: label = "" result = check_query_forms.decompose_label_features(label) assert result == [] @@ -466,13 +470,13 @@ def test_decompose_label_features_empty() -> None: # MARK: extract_form_qids -def test_extract_form_qids_valid() -> None: +def test_wikidata_extract_form_qids_valid() -> None: form_text = "wikibase:grammaticalFeature wd:Q123, wd:Q456 ." result = check_query_forms.extract_form_qids(form_text) assert result == ["Q123", "Q456"] -def test_extract_form_qids_no_match() -> None: +def test_wikidata_extract_form_qids_no_match() -> None: form_text = "invalid text" result = check_query_forms.extract_form_qids(form_text) assert result is None @@ -481,25 +485,25 @@ def test_extract_form_qids_no_match() -> None: # MARK: check_form_label -def test_check_form_label_match() -> None: +def test_wikidata_check_form_label_match() -> None: form_text = "?lexeme ontolex:lexicalForm ?testForm .\n?testForm ontolex:representation ?test ;" result = check_query_forms.check_form_label(form_text) assert result is True -def test_check_form_label_no_form_label() -> None: +def test_wikidata_check_form_label_no_form_label() -> None: form_text = "invalid text" result = check_query_forms.check_form_label(form_text) assert result is False -def test_check_form_label_no_rep_label() -> None: +def test_wikidata_check_form_label_no_rep_label() -> None: form_text = "?lexeme ontolex:lexicalForm ?testForm ." result = check_query_forms.check_form_label(form_text) assert result is False -def test_check_form_label_mismatch() -> None: +def test_wikidata_check_form_label_mismatch() -> None: form_text = "?lexeme ontolex:lexicalForm ?testForm .\n?testForm ontolex:representation ?other ;" result = check_query_forms.check_form_label(form_text) assert result is False @@ -508,19 +512,19 @@ def test_check_form_label_mismatch() -> None: # MARK: check_query_formatting -def test_check_query_formatting_valid() -> None: +def test_wikidata_check_query_formatting_valid() -> None: form_text = "valid . text ;" result = check_query_forms.check_query_formatting(form_text) assert result is True -def test_check_query_formatting_space_before_comma() -> None: +def test_wikidata_check_query_formatting_space_before_comma() -> None: form_text = "invalid , text" result = check_query_forms.check_query_formatting(form_text) assert result is False -def test_check_query_formatting_nonspace_before_period() -> None: +def test_wikidata_check_query_formatting_nonspace_before_period() -> None: form_text = "invalid.text" result = check_query_forms.check_query_formatting(form_text) assert result is False @@ -529,7 +533,7 @@ def test_check_query_formatting_nonspace_before_period() -> None: # MARK: return_correct_form_label -def test_return_correct_form_label_valid() -> None: +def test_wikidata_return_correct_form_label_valid() -> None: qids = ["Q123"] with patch.object(check_query_forms, "lexeme_form_qid_order", ["Q123"]): with patch.object( @@ -541,12 +545,12 @@ def test_return_correct_form_label_valid() -> None: assert result == "nominative" -def test_return_correct_form_label_empty() -> None: +def test_wikidata_return_correct_form_label_empty() -> None: result = check_query_forms.return_correct_form_label([]) assert result == "Invalid query formatting found" -def test_return_correct_form_label_not_included() -> None: +def test_wikidata_return_correct_form_label_not_included() -> None: qids = ["Q999"] with patch.object(check_query_forms, "lexeme_form_qid_order", ["Q123"]): result = check_query_forms.return_correct_form_label(qids) @@ -554,22 +558,19 @@ def test_return_correct_form_label_not_included() -> None: def validate_forms(query_text: str) -> str: - errors = [] - - # Extract SELECT variables + # Extract SELECT variables. select_match = re.search(r"SELECT\s+([^{]+)\s+WHERE", query_text, re.IGNORECASE) if not select_match: return "Invalid query format: no SELECT match" select_vars = [ - var.strip() for var in select_match.group(1).split() if var.startswith("?") + var.strip() for var in select_match[1].split() if var.startswith("?") ] - # Extract variables defined in WHERE clause. - where_vars = set() # Pattern for ontolex:representation variables (forms). forms_pattern = r"ontolex:representation\s+\?(\w+)" - for match in re.finditer(forms_pattern, query_text): - where_vars.add(f"?{match.group(1)}") + where_vars = { + f"?{match.group(1)}" for match in re.finditer(forms_pattern, query_text) + } # Add other variables defined in WHERE (e.g., bound variables). # Example: ?lexeme, ?lemma, ?lastModified, ?formLex. @@ -578,6 +579,7 @@ def validate_forms(query_text: str) -> str: where_vars.add(f"?{match.group(1)}") where_vars.add(f"?{match.group(2)}") + errors = [] # Check for duplicates in SELECT. select_vars_set = set(select_vars) if len(select_vars_set) < len(select_vars): @@ -620,7 +622,7 @@ def validate_forms(query_text: str) -> str: # MARK: validate_forms -def test_validate_forms_valid() -> None: +def test_wikidata_validate_forms_valid() -> None: # Ensure all variables in SELECT are defined in WHERE and order matches. # Use ontolex:representation to define ?form so it matches forms_pattern. query_text = """ @@ -640,13 +642,13 @@ def test_validate_forms_valid() -> None: assert result == "" -def test_validate_forms_no_select() -> None: +def test_wikidata_validate_forms_no_select() -> None: query_text = "WHERE { }" result = check_query_forms.validate_forms(query_text) assert result == "Invalid query format: no SELECT match" -def test_validate_forms_duplicates() -> None: +def test_wikidata_validate_forms_duplicates() -> None: query_text = """ SELECT ?lexeme @@ -665,7 +667,7 @@ def test_validate_forms_duplicates() -> None: assert "Duplicate forms found in SELECT: form" in result -def test_validate_forms_undefined() -> None: +def test_wikidata_validate_forms_undefined() -> None: query_text = """ SELECT ?lexeme @@ -681,7 +683,7 @@ def test_validate_forms_undefined() -> None: assert "Undefined forms found in SELECT: form" in result -def test_validate_forms_unreturned() -> None: +def test_wikidata_validate_forms_unreturned() -> None: query_text = """ SELECT ?lexeme @@ -699,7 +701,7 @@ def test_validate_forms_unreturned() -> None: assert "Defined but unreturned forms found: formRep" in result -def test_validate_forms_order_mismatch() -> None: +def test_wikidata_validate_forms_order_mismatch() -> None: # Ensure variables are defined, then create an order mismatch. # Both ?form and ?formRep must be captured by forms_pattern. query_text = """ @@ -728,13 +730,13 @@ def test_validate_forms_order_mismatch() -> None: # MARK: check_docstring -def test_check_docstring_valid() -> None: +def test_wikidata_check_docstring_valid() -> None: query_text = "# tool: scribe-data\n# All nouns (Q123) and verbs (Q456) and the given forms.\n# Enter this query at https://query.wikidata.org/.\n" result = check_query_forms.check_docstring(query_text) assert result is True -def test_check_docstring_invalid_line1() -> None: +def test_wikidata_check_docstring_invalid_line1() -> None: query_text = "# wrong tool\n# All nouns (Q123) and verbs (Q456) and the given forms.\n# Enter this query at https://query.wikidata.org/.\n" result = check_query_forms.check_docstring(query_text) assert result == (False, "Error in line 1: # wrong tool") @@ -743,7 +745,7 @@ def test_check_docstring_invalid_line1() -> None: # MARK: check_forms_order -def test_check_forms_order_valid() -> None: +def test_wikidata_check_forms_order_valid() -> None: query_text = """ SELECT ?lexeme @@ -763,7 +765,7 @@ def test_check_forms_order_valid() -> None: assert result is True -def test_check_forms_order_invalid(capsys: pytest.CaptureFixture) -> None: +def test_wikidata_check_forms_order_invalid(capsys: pytest.CaptureFixture) -> None: query_text = """ SELECT ?lexeme @@ -788,7 +790,7 @@ def test_check_forms_order_invalid(capsys: pytest.CaptureFixture) -> None: # MARK: check_optional_qid_order -def test_check_optional_qid_order_valid(tmp_path: Path) -> None: +def test_wikidata_check_optional_qid_order_valid(tmp_path: Path) -> None: sparql_file = tmp_path / "test.sparql" sparql_file.write_text( " OPTIONAL { ?lexeme ontolex:lexicalForm ?form . ?form ontolex:representation ?nominative ; wikibase:grammaticalFeature wd:Q123 . }" @@ -798,7 +800,7 @@ def test_check_optional_qid_order_valid(tmp_path: Path) -> None: assert result == "" -def test_check_optional_qid_order_invalid(tmp_path: Path) -> None: +def test_wikidata_check_optional_qid_order_invalid(tmp_path: Path) -> None: sparql_file = tmp_path / "test.sparql" sparql_file.write_text( " OPTIONAL { ?lexeme ontolex:lexicalForm ?form . ?form ontolex:representation ?nominative ; wikibase:grammaticalFeature wd:Q456 . }" @@ -814,7 +816,7 @@ def test_check_optional_qid_order_invalid(tmp_path: Path) -> None: @patch("pathlib.Path.glob", return_value=[]) -def test_check_query_forms_no_files( +def test_wikidata_check_query_forms_no_files( mock_glob: MagicMock, capsys: pytest.CaptureFixture ) -> None: # Mock WIKIDATA_QUERIES_ALL_DATA_DIR as a Path object with the patched glob. @@ -827,7 +829,7 @@ def test_check_query_forms_no_files( @patch("pathlib.Path.glob") -def test_check_query_forms_with_errors( +def test_wikidata_check_query_forms_with_errors( mock_glob: MagicMock, tmp_path: Path, capsys: pytest.CaptureFixture ) -> None: sparql_file = tmp_path / "test.sparql" diff --git a/tests/cli/test_dump.py b/tests/wikidata/test_wikidata_dump.py similarity index 93% rename from tests/cli/test_dump.py rename to tests/wikidata/test_wikidata_dump.py index 70218c80a..f56be809c 100644 --- a/tests/cli/test_dump.py +++ b/tests/wikidata/test_wikidata_dump.py @@ -59,7 +59,9 @@ def lexeme_processor() -> LexemeProcessor: ) -def test_lexeme_processor_initialization(lexeme_processor: LexemeProcessor) -> None: +def test_wikidata_lexeme_processor_initialization( + lexeme_processor: LexemeProcessor, +) -> None: """ Test LexemeProcessor initialization with basic parameters. """ @@ -71,7 +73,7 @@ def test_lexeme_processor_initialization(lexeme_processor: LexemeProcessor) -> N @patch("builtins.open", new_callable=mock_open, read_data=Sample_Lexeme_Line) @patch("bz2.open") -def test_process_file( +def test_wikidata_process_file( mock_bz2_open: MagicMock, mock_file: MagicMock, lexeme_processor: LexemeProcessor ) -> None: """ @@ -89,7 +91,7 @@ def test_process_file( @patch("scribe_data.wikidata.parse_dump.LexemeProcessor") -def test_parse_dump(mock_processor: MagicMock) -> None: +def test_wikidata_parse_dump(mock_processor: MagicMock) -> None: """ Test the parse_dump function. """ @@ -105,7 +107,7 @@ def test_parse_dump(mock_processor: MagicMock) -> None: @patch("scribe_data.wikidata.wikidata_utils.Path") @patch("scribe_data.wikidata.wikidata_utils.wd_lexeme_dump_download_wrapper") @patch("scribe_data.wikidata.wikidata_utils.parse_dump") -def test_parse_wd_lexeme_dump( +def test_wikidata_parse_wd_lexeme_dump( mock_parse_dump: MagicMock, mock_download: MagicMock, mock_path_class: MagicMock ) -> None: """ @@ -159,7 +161,7 @@ def test_parse_wd_lexeme_dump( assert kwargs["data_types"] == ["nouns"] -def test_parse_wd_lexeme_dump_no_file() -> None: +def test_wikidata_parse_wd_lexeme_dump_no_file() -> None: """ Test parse_wd_lexeme_dump when no file is found. """ @@ -186,7 +188,7 @@ def test_parse_wd_lexeme_dump_no_file() -> None: ({"total": True}, True), ], ) -def test_parse_types(test_input: dict[str, bool], expected: bool) -> None: +def test_wikidata_parse_types(test_input: dict[str, bool], expected: bool) -> None: """ Test different parse types. """ diff --git a/tests/wikidata/test_query_data.py b/tests/wikidata/test_wikidata_query_data.py similarity index 98% rename from tests/wikidata/test_query_data.py rename to tests/wikidata/test_wikidata_query_data.py index f04298c8e..329d7551e 100644 --- a/tests/wikidata/test_query_data.py +++ b/tests/wikidata/test_wikidata_query_data.py @@ -16,7 +16,7 @@ class TestQueryData(unittest.TestCase): @patch("subprocess.run") @patch("sys.executable", return_value="python") - def test_execute_formatting_script( + def test_wikidata_execute_formatting_script( self, mock_executable: MagicMock, mock_run: MagicMock ) -> None: """ @@ -55,7 +55,7 @@ def test_execute_formatting_script( "/output/dir", "German", "nouns" ) # should print error but not raise exceptions - def test_query_data_multiple_intervals(self) -> None: + def test_wikidata_query_data_multiple_intervals(self) -> None: """ Test query_data with multiple query intervals. """ @@ -166,7 +166,7 @@ def test_query_data_multiple_intervals(self) -> None: out.getvalue(), ) - def test_query_data_single_query_error(self) -> None: + def test_wikidata_query_data_single_query_error(self) -> None: """ Test that query_data handles a single query returning None. """ @@ -242,7 +242,7 @@ def test_query_data_single_query_error(self) -> None: # Check that execute_formatting_script is not called. mock_exec.assert_not_called() - def test_query_data_multiple_intervals_error(self) -> None: + def test_wikidata_query_data_multiple_intervals_error(self) -> None: """ Test query_data with multiple query intervals where the second query throws an HTTPError and subsequent queries return None. diff --git a/tests/wiktionary/test_parse_translations.py b/tests/wiktionary/test_wiktionary_parse_translations.py similarity index 100% rename from tests/wiktionary/test_parse_translations.py rename to tests/wiktionary/test_wiktionary_parse_translations.py From 9f6ee8c8dce8a5e7ba3b20391562f4584167e77e Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 12 Jun 2026 09:15:35 -0700 Subject: [PATCH 2/2] WIP commit to detail changes - will change direction --- src/scribe_data/cli/contracts/check.py | 15 ++-- src/scribe_data/cli/contracts/filter.py | 21 +++--- src/scribe_data/cli/convert/to_csv_or_tsv.py | 13 ++-- src/scribe_data/cli/convert/to_json.py | 4 +- src/scribe_data/cli/convert/to_sqlite.py | 24 +++--- src/scribe_data/cli/convert/wrapper.py | 10 +-- .../cli/download/wikidata_lexeme_dump.py | 10 +-- .../cli/download/wiktionary_dump.py | 6 +- src/scribe_data/cli/get.py | 48 +++--------- src/scribe_data/cli/interactive/config.py | 10 +-- src/scribe_data/cli/interactive/execute.py | 13 ---- src/scribe_data/cli/interactive/prompt.py | 8 +- src/scribe_data/cli/interactive/run.py | 38 +--------- src/scribe_data/cli/total/wrapper.py | 3 - src/scribe_data/unicode/process_unicode.py | 4 +- src/scribe_data/utils.py | 19 +++-- src/scribe_data/wikidata/parse_dump.py | 21 +++--- src/scribe_data/wikidata/query_data.py | 6 +- src/scribe_data/wikidata/wikidata_utils.py | 15 +--- .../wiktionary/parse_translations.py | 15 ++-- .../cli/contracts/test_cli_contracts_check.py | 26 ++++--- .../contracts/test_cli_contracts_export.py | 9 ++- .../convert/test_cli_convert_to_csv_or_tsv.py | 14 ++-- tests/cli/convert/test_cli_convert_to_json.py | 14 ++-- .../cli/convert/test_cli_convert_to_sqlite.py | 18 ++--- tests/cli/convert/test_cli_convert_wrapper.py | 14 ++-- .../test_cli_download_wikidata_lexeme_dump.py | 22 +++--- .../test_cli_interactive_config.py | 12 ++- .../test_cli_interactive_execute.py | 8 +- .../test_cli_interactive_prompt.py | 6 +- tests/cli/test_cli_get.py | 16 ++-- tests/cli/total/test_cli_total_wrapper.py | 74 ++++++++----------- tests/conftest.py | 53 +++++++++++++ tests/wikidata/test_wikidata_dump.py | 4 +- .../test_wiktionary_parse_translations.py | 10 +-- 35 files changed, 277 insertions(+), 326 deletions(-) create mode 100644 tests/conftest.py diff --git a/src/scribe_data/cli/contracts/check.py b/src/scribe_data/cli/contracts/check.py index 89171221a..0c35c371e 100644 --- a/src/scribe_data/cli/contracts/check.py +++ b/src/scribe_data/cli/contracts/check.py @@ -8,7 +8,7 @@ from scribe_data.cli.contracts.filter import ( DEFAULT_DATA_CONTRACTS_DIR, - DEFAULT_JSON_EXPORT_DIR, + DEFAULT_JSON_DIR, filter_contract_metadata, ) from scribe_data.utils import get_language_from_iso, get_language_iso @@ -25,7 +25,7 @@ def check_contract_data_completeness( - contracts_dir: Path, + contracts_dir: Path = DEFAULT_DATA_CONTRACTS_DIR, ) -> dict[str, dict[str, list[str]]]: """ Validate exported data contracts against their metadata requirements. @@ -54,7 +54,10 @@ def check_contract_data_completeness( The above is the expected structure. """ # Determine languages to check. - languages_to_check = [Path(f).stem.lower() for f in contracts_dir.glob("*.yaml")] + languages_to_check = [ + get_language_from_iso(iso=Path(f).stem.lower()) + for f in contracts_dir.glob("*.yaml") + ] languages_to_check = [ lang for lang in languages_to_check @@ -80,7 +83,7 @@ def check_contract_data_completeness( # Get contract metadata. contract_metadata = filter_contract_metadata(contract_file) - export_lang_dir = DEFAULT_JSON_EXPORT_DIR / lang_dir_name + export_lang_dir = DEFAULT_JSON_DIR / lang_dir_name # Check missing forms for nouns and verbs. lang_missing_forms = {} @@ -166,9 +169,9 @@ def check_contract_data_print_missing(contracts_dir: Path) -> None: contracts_dir = Path(contracts_dir) if contracts_dir else DEFAULT_DATA_CONTRACTS_DIR - if not DEFAULT_JSON_EXPORT_DIR.exists(): + if not contracts_dir.exists(): print( - f"Error: Directory {DEFAULT_JSON_EXPORT_DIR} does not exist.\nPlease use export JSON first." + f"Error: Directory {contracts_dir} does not exist.\nPlease provide a valid path to the data contracts or don't pass an argument to use the default contracts." ) return diff --git a/src/scribe_data/cli/contracts/filter.py b/src/scribe_data/cli/contracts/filter.py index 853ee28a8..43a81d945 100644 --- a/src/scribe_data/cli/contracts/filter.py +++ b/src/scribe_data/cli/contracts/filter.py @@ -13,8 +13,8 @@ from scribe_data.utils import ( DEFAULT_DATA_CONTRACTS_DIR, - DEFAULT_FILTERED_JSON_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, + DEFAULT_FILTERED_JSON_DIR, + DEFAULT_JSON_DIR, get_language_from_iso, ) @@ -245,7 +245,7 @@ def export_data_filtered_by_contracts(contracts_dir: Path) -> None: contracts_dir = Path(contracts_dir) if contracts_dir else DEFAULT_DATA_CONTRACTS_DIR # Use provided output dir or default. - DEFAULT_FILTERED_JSON_EXPORT_DIR.mkdir(parents=True, exist_ok=True) + DEFAULT_FILTERED_JSON_DIR.mkdir(parents=True, exist_ok=True) for contract_filename in os.listdir(contracts_dir): if not contract_filename.endswith(".yaml"): @@ -267,15 +267,14 @@ def export_data_filtered_by_contracts(contracts_dir: Path) -> None: continue # Create language directory in export path. - lang_export_dir = ( - DEFAULT_FILTERED_JSON_EXPORT_DIR - / matched_language.lower().replace(" ", "_") + lang_export_dir = DEFAULT_FILTERED_JSON_DIR / matched_language.lower().replace( + " ", "_" ) lang_export_dir.mkdir(parents=True, exist_ok=True) - lang_input_dir = Path( - DEFAULT_JSON_EXPORT_DIR - ) / matched_language.lower().replace(" ", "_") + lang_input_dir = Path(DEFAULT_JSON_DIR) / matched_language.lower().replace( + " ", "_" + ) if not lang_input_dir.exists(): print(f"No input directory found for {matched_language}") continue @@ -289,7 +288,7 @@ def export_data_filtered_by_contracts(contracts_dir: Path) -> None: # Skip unsupported types if needed. if data_type not in contract_metadata: output_file = ( - DEFAULT_FILTERED_JSON_EXPORT_DIR + DEFAULT_FILTERED_JSON_DIR / matched_language.lower().replace(" ", "_") / f"{data_type}.json" ) @@ -308,7 +307,7 @@ def export_data_filtered_by_contracts(contracts_dir: Path) -> None: input_file, contract_metadata, data_type ): output_file = ( - DEFAULT_FILTERED_JSON_EXPORT_DIR + DEFAULT_FILTERED_JSON_DIR / matched_language.lower().replace(" ", "_") / f"{data_type}.json" ) diff --git a/src/scribe_data/cli/convert/to_csv_or_tsv.py b/src/scribe_data/cli/convert/to_csv_or_tsv.py index b3197129d..43bbe2e16 100644 --- a/src/scribe_data/cli/convert/to_csv_or_tsv.py +++ b/src/scribe_data/cli/convert/to_csv_or_tsv.py @@ -8,9 +8,9 @@ from pathlib import Path from scribe_data.utils import ( - DEFAULT_CSV_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_TSV_EXPORT_DIR, + DEFAULT_CSV_DIR, + DEFAULT_JSON_DIR, + DEFAULT_TSV_DIR, camel_to_snake, check_index_exists, ) @@ -61,8 +61,7 @@ def convert_to_csv_or_tsv( # Modify input file path to use the provided input_file or default JSON export path. input_file_path = ( - input_file - or DEFAULT_JSON_EXPORT_DIR / language.lower() / f"{data_types[0]}.json" + input_file or DEFAULT_JSON_DIR / language.lower() / f"{data_types[0]}.json" ) for dtype in data_types: @@ -81,9 +80,7 @@ def convert_to_csv_or_tsv( # Determine the delimiter based on output type. delimiter = "," if output_type == "csv" else "\t" - output_dir = ( - DEFAULT_CSV_EXPORT_DIR if output_type == "csv" else DEFAULT_TSV_EXPORT_DIR - ) + output_dir = DEFAULT_CSV_DIR if output_type == "csv" else DEFAULT_TSV_DIR final_output_dir = output_dir / language.capitalize() final_output_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/scribe_data/cli/convert/to_json.py b/src/scribe_data/cli/convert/to_json.py index 37d052842..4d6fde147 100644 --- a/src/scribe_data/cli/convert/to_json.py +++ b/src/scribe_data/cli/convert/to_json.py @@ -8,7 +8,7 @@ from pathlib import Path from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, + DEFAULT_JSON_DIR, camel_to_snake, check_index_exists, ) @@ -60,7 +60,7 @@ def convert_to_json( if not data_types: return - json_output_dir = Path(DEFAULT_JSON_EXPORT_DIR) / language.capitalize() + json_output_dir = Path(DEFAULT_JSON_DIR) / language.capitalize() json_output_dir.mkdir(parents=True, exist_ok=True) for dtype in data_types: diff --git a/src/scribe_data/cli/convert/to_sqlite.py b/src/scribe_data/cli/convert/to_sqlite.py index 903ab0751..19a085d77 100644 --- a/src/scribe_data/cli/convert/to_sqlite.py +++ b/src/scribe_data/cli/convert/to_sqlite.py @@ -13,8 +13,8 @@ from tqdm.auto import tqdm from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_SQLITE_EXPORT_DIR, + DEFAULT_JSON_DIR, + DEFAULT_SQLITE_DIR, camel_to_snake, data_type_metadata, get_language_iso, @@ -97,7 +97,7 @@ def translations_to_sqlite( language_data_type_dict: dict, current_languages: list, identifier_case: str = "snake", - input_file: Path = DEFAULT_JSON_EXPORT_DIR, + input_file: Path = DEFAULT_JSON_DIR, overwrite: bool = False, ) -> None: """ @@ -114,14 +114,14 @@ def translations_to_sqlite( identifier_case : str, optional The identifier case. Default is "snake". - input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR + input_file : str, optional, default=DEFAULT_JSON_DIR The input JSON export directory. overwrite : bool, optional If True, existing SQLite files will be overwritten without prompting. """ maybe_over = "" - translation_db_path = Path(DEFAULT_SQLITE_EXPORT_DIR) / "TranslationData.sqlite" + translation_db_path = Path(DEFAULT_SQLITE_DIR) / "TranslationData.sqlite" if translation_db_path.exists(): if not overwrite: answer = questionary.confirm( @@ -188,7 +188,7 @@ def translations_to_sqlite( def wiktionary_translations_to_sqlite( language, identifier_case="snake", - input_file=DEFAULT_JSON_EXPORT_DIR, + input_file=DEFAULT_JSON_DIR, overwrite: bool = False, ): """ @@ -208,7 +208,7 @@ def wiktionary_translations_to_sqlite( identifier_case : str, optional Either "camel" or "snake" to determine column naming. Default is "snake". - input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR + input_file : str, optional, default=DEFAULT_JSON_DIR The input JSON export directory. overwrite : bool, optional @@ -232,7 +232,7 @@ def wiktionary_translations_to_sqlite( if not translation_files: return - db_path = Path(DEFAULT_SQLITE_EXPORT_DIR) / "TranslationData.sqlite" + db_path = Path(DEFAULT_SQLITE_DIR) / "TranslationData.sqlite" db_path.parent.mkdir(parents=True, exist_ok=True) connection = sqlite3.connect(db_path) cursor = connection.cursor() @@ -298,7 +298,7 @@ def convert_to_sqlite( languages: list[str] | None = None, specific_tables: str | list[str] | None = None, identifier_case: str = "camel", - input_file: Path = DEFAULT_JSON_EXPORT_DIR, + input_file: Path = DEFAULT_JSON_DIR, overwrite: bool = False, ) -> None: """ @@ -315,7 +315,7 @@ def convert_to_sqlite( identifier_case : str, optional, default='camel' Format of the identifiers ("camel" or "snake"). Defaults to "camel". - input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR + input_file : str, optional, default=DEFAULT_JSON_DIR The input JSON export directory. overwrite : bool, optional @@ -328,7 +328,7 @@ def convert_to_sqlite( ) # Ensure the SQLite export directory exists before creating the database. - DEFAULT_SQLITE_EXPORT_DIR.mkdir(parents=True, exist_ok=True) + DEFAULT_SQLITE_DIR.mkdir(parents=True, exist_ok=True) current_language_data = language_metadata data_types = data_type_metadata @@ -428,7 +428,7 @@ def convert_to_sqlite( if language_data_type_dict[lang] != []: maybe_over = "" db_file = ( - Path(DEFAULT_SQLITE_EXPORT_DIR) + Path(DEFAULT_SQLITE_DIR) / f"{get_language_iso(lang).upper()}LanguageData.sqlite" ) if db_file.exists(): diff --git a/src/scribe_data/cli/convert/wrapper.py b/src/scribe_data/cli/convert/wrapper.py index 6293ede5c..0bfd51105 100644 --- a/src/scribe_data/cli/convert/wrapper.py +++ b/src/scribe_data/cli/convert/wrapper.py @@ -9,8 +9,8 @@ from scribe_data.cli.convert.to_json import convert_to_json from scribe_data.cli.convert.to_sqlite import convert_to_sqlite from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, + DEFAULT_JSON_DIR, + DEFAULT_WIKTIONARY_JSON_DIR, ) # MARK: Convert Wrapper @@ -61,11 +61,7 @@ def convert_wrapper( isinstance(dt, str) and dt.startswith("wiktionary") for dt in (data_types if isinstance(data_types, list) else [data_types]) ) - input_path = ( - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR - if is_wiktionary - else DEFAULT_JSON_EXPORT_DIR - ) + input_path = DEFAULT_WIKTIONARY_JSON_DIR if is_wiktionary else DEFAULT_JSON_DIR if output_type == "json" and languages and data_types: convert_to_json( diff --git a/src/scribe_data/cli/download/wikidata_lexeme_dump.py b/src/scribe_data/cli/download/wikidata_lexeme_dump.py index 34b042060..7d5733cd0 100644 --- a/src/scribe_data/cli/download/wikidata_lexeme_dump.py +++ b/src/scribe_data/cli/download/wikidata_lexeme_dump.py @@ -16,8 +16,8 @@ from tqdm import tqdm from scribe_data.utils import ( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, + DEFAULT_WIKIDATA_DUMP_DIR, + DEFAULT_WIKTIONARY_DUMP_DIR, check_lexeme_dump_prompt_download, ) @@ -230,12 +230,12 @@ def wd_lexeme_dump_download_wrapper( - Returns None if the user chooses not to proceed with the download or no valid dump URL is found. """ try: - os.makedirs(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, exist_ok=True) + os.makedirs(DEFAULT_WIKIDATA_DUMP_DIR, exist_ok=True) # Don't check for lexeme if date given. if not dump_snapshot: if useable_file_dir := check_lexeme_dump_prompt_download( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR + DEFAULT_WIKIDATA_DUMP_DIR ): return useable_file_dir @@ -246,7 +246,7 @@ def wd_lexeme_dump_download_wrapper( return None filename = dump_url.split("/")[-1] - output_path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR / filename + output_path = DEFAULT_WIKTIONARY_DUMP_DIR / filename # Use default parameter to bypass user confirmation. user_response = ( diff --git a/src/scribe_data/cli/download/wiktionary_dump.py b/src/scribe_data/cli/download/wiktionary_dump.py index 2cb92bc41..e6ca6f065 100644 --- a/src/scribe_data/cli/download/wiktionary_dump.py +++ b/src/scribe_data/cli/download/wiktionary_dump.py @@ -11,7 +11,7 @@ from tqdm import tqdm from scribe_data.utils import ( - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, + DEFAULT_WIKTIONARY_DUMP_DIR, resolve_lang_iso, ) @@ -59,7 +59,7 @@ def download_wiktionary_dumps( wiktionaries = [f"{iso}wiktionary" for iso in language_isos] wiktionary_urls = [f"https://dumps.wikimedia.org/{w}" for w in wiktionaries] - Path(DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR).mkdir(parents=True, exist_ok=True) + Path(DEFAULT_WIKTIONARY_DUMP_DIR).mkdir(parents=True, exist_ok=True) for i, w, u in zip(language_isos, wiktionaries, wiktionary_urls): # Note: Remove the snapshot from the resulting filename so Scribe-Server always looks for one file. filename = f"{w}-pages-articles.xml.bz2" @@ -75,7 +75,7 @@ def download_wiktionary_dumps( rprint(f"[bold red]Invalid dump date or dump not found: {e}[/bold red]") return None - output_path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR / filename + output_path = DEFAULT_WIKTIONARY_DUMP_DIR / filename if output_path.exists(): rprint(f"[bold yellow]Dump already exists: {output_path}[/bold yellow]") diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index b0d5a9f96..6a435f890 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -17,12 +17,8 @@ from scribe_data.cli.convert.wrapper import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji from scribe_data.utils import ( - DEFAULT_CSV_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_SQLITE_EXPORT_DIR, - DEFAULT_TSV_EXPORT_DIR, - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, + DEFAULT_JSON_DIR, + DEFAULT_WIKIDATA_DUMP_DIR, check_index_exists, ) from scribe_data.wikidata.query_data import query_data @@ -82,19 +78,6 @@ def get_data( Dict[str, bool] | None The requested data saved locally given file type and location arguments. """ - # MARK: Defaults - - if data_types == ["translations"]: - output_dir = DEFAULT_WIKTIONARY_JSON_EXPORT_DIR - - else: - output_dir = { - "csv": DEFAULT_CSV_EXPORT_DIR, - "json": DEFAULT_JSON_EXPORT_DIR, - "sqlite": DEFAULT_SQLITE_EXPORT_DIR, - "tsv": DEFAULT_TSV_EXPORT_DIR, - }.get(output_type, DEFAULT_JSON_EXPORT_DIR) - language_or_languages = ( "language" if languages and len(languages) == 1 else "languages" ) @@ -125,7 +108,6 @@ def prompt_user_download_all() -> bool: query_data( languages=[language_or_sub_language], data_types=["all"], - output_dir=output_dir, overwrite=overwrite, ) print( @@ -137,8 +119,6 @@ def prompt_user_download_all() -> bool: languages=languages, data_types=["all"], wikidata_dump_type=["form"], - output_dir=output_dir, - wikidata_dump_path=wikidata_dump_path, overwrite_all=overwrite, ) @@ -149,7 +129,6 @@ def prompt_user_download_all() -> bool: query_data( languages=["all"], data_types=data_types, - output_dir=output_dir, overwrite=overwrite, ) print(f"Query completed for all languages for data type: {data_type}") @@ -159,8 +138,6 @@ def prompt_user_download_all() -> bool: languages=["all"], data_types=data_types, wikidata_dump_type=["form"], - output_dir=output_dir, - wikidata_dump_path=wikidata_dump_path, overwrite_all=overwrite, ) @@ -173,8 +150,6 @@ def prompt_user_download_all() -> bool: languages=["all"], data_types=["all"], wikidata_dump_type=["form", "translations"], - output_dir=output_dir, - wikidata_dump_path=wikidata_dump_path, overwrite_all=overwrite, ) @@ -186,10 +161,7 @@ def prompt_user_download_all() -> bool: and len(data_types) == 1 and data_types[0] in {"emoji-keywords", "emoji_keywords"} ): - generate_emoji( - language=languages[0], # only one possible - output_dir=output_dir, - ) + generate_emoji(language=languages[0]) # only one possible # MARK: Translations @@ -201,7 +173,6 @@ def prompt_user_download_all() -> bool: parse_wiktionary_translations( target_languages=languages, wiktionary_dump_path=wiktionary_dump, - output_dir=output_dir, overwrite=overwrite, ) return @@ -211,14 +182,12 @@ def prompt_user_download_all() -> bool: elif wikidata_dump_path is not None: # If wikidata_dump is an empty string, use the default path. if not wikidata_dump_path: - wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR + wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_DIR parse_wd_lexeme_dump( languages=languages or ["all"], data_types=data_types, wikidata_dump_type=["form"], - output_dir=output_dir, - wikidata_dump_path=wikidata_dump_path, overwrite_all=overwrite, ) return @@ -233,7 +202,9 @@ def prompt_user_download_all() -> bool: f"{', '.join([t.capitalize() for t in data_types])}" ) - json_path = Path(output_dir) / language_or_sub_language / f"{data_type}.json" + json_path = ( + Path(DEFAULT_JSON_DIR) / language_or_sub_language / f"{data_type}.json" + ) if not overwrite and check_index_exists(json_path): print( f"Skipping update for {language_or_sub_language.title()} {data_type}." @@ -257,14 +228,13 @@ def print_error_and_suggestions(error_message: str) -> None: query_data( languages=[language_or_sub_language], data_types=data_types, - output_dir=output_dir, overwrite=overwrite, interactive=interactive, ) # Only print this line if no exception was raised. if not all_bool: - print(f"Updated data was saved in: {Path(output_dir).resolve()}.") + print(f"Updated data was saved in: {Path(DEFAULT_JSON_DIR).resolve()}.") except json.decoder.JSONDecodeError: print_error_and_suggestions( @@ -292,7 +262,7 @@ def print_error_and_suggestions(error_message: str) -> None: # MARK: Output Conversion json_input_path = ( - Path(output_dir) / f"{language_or_sub_language}/{data_type}.json" + Path(DEFAULT_JSON_DIR) / f"{language_or_sub_language}/{data_type}.json" ) if output_type and output_type != "json" and json_input_path.exists(): diff --git a/src/scribe_data/cli/interactive/config.py b/src/scribe_data/cli/interactive/config.py index 13bcf58a3..fa46ff5e5 100644 --- a/src/scribe_data/cli/interactive/config.py +++ b/src/scribe_data/cli/interactive/config.py @@ -6,8 +6,8 @@ from pathlib import Path from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_SQLITE_EXPORT_DIR, + DEFAULT_JSON_DIR, + DEFAULT_SQLITE_DIR, data_type_metadata, language_metadata, list_all_languages, @@ -32,12 +32,12 @@ def __init__(self) -> None: self.selected_languages: list[str] = [] self.selected_data_types: list[str] = [] self.output_type: str = "json" - self.output_dir: Path = DEFAULT_JSON_EXPORT_DIR + self.output_dir: Path = DEFAULT_JSON_DIR self.overwrite: bool = False self.configured: bool = False self.identifier_case: str = "camel" - self.input_dir: Path = DEFAULT_JSON_EXPORT_DIR - self.output_dir_sqlite: Path = DEFAULT_SQLITE_EXPORT_DIR + self.input_dir: Path = DEFAULT_JSON_DIR + self.output_dir_sqlite: Path = DEFAULT_SQLITE_DIR interactive_mode_config = ScribeDataConfig() diff --git a/src/scribe_data/cli/interactive/execute.py b/src/scribe_data/cli/interactive/execute.py index 3ba7c7586..65135bd72 100644 --- a/src/scribe_data/cli/interactive/execute.py +++ b/src/scribe_data/cli/interactive/execute.py @@ -4,10 +4,8 @@ """ import logging -from pathlib import Path import questionary -from prompt_toolkit import prompt from rich import print as rprint from rich.console import Console from rich.logging import RichHandler @@ -24,7 +22,6 @@ prompt_for_languages, ) from scribe_data.cli.total.wrapper import total_wrapper -from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump # MARK: Logging @@ -126,17 +123,8 @@ def request_total_lexeme_loop() -> None: break elif choice == "run_all": - if wikidata_dump_path := prompt( - f"Enter Wikidata lexeme dump path (default: {str(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR)}): " - ): - wikidata_dump_path = Path(wikidata_dump_path) - - else: - wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR - parse_wd_lexeme_dump( languages=interactive_mode_config.selected_languages, - wikidata_dump_path=wikidata_dump_path, wikidata_dump_type=["total"], interactive_mode=True, ) @@ -173,7 +161,6 @@ def display_summary() -> None: ", ".join(interactive_mode_config.selected_data_types) or "None", ) table.add_row("Output Type", interactive_mode_config.output_type) - table.add_row("Output Directory", str(interactive_mode_config.output_dir)) table.add_row("Overwrite", "Yes" if interactive_mode_config.overwrite else "No") console.print("\n") diff --git a/src/scribe_data/cli/interactive/prompt.py b/src/scribe_data/cli/interactive/prompt.py index fdda147a9..a793fe8fd 100644 --- a/src/scribe_data/cli/interactive/prompt.py +++ b/src/scribe_data/cli/interactive/prompt.py @@ -10,7 +10,7 @@ from rich import print as rprint from scribe_data.cli.interactive.config import interactive_mode_config -from scribe_data.utils import DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, resolve_lang_iso +from scribe_data.utils import DEFAULT_WIKTIONARY_DUMP_DIR, resolve_lang_iso # MARK: Word Completion @@ -128,7 +128,7 @@ def _wiktionary_dump_search_dirs() -> list[Path]: Duplicate paths are omitted while preserving the following search order: 1. The provided ``location`` directory. - 2. The default export directory (:data:`~scribe_data.utils.DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR`). + 2. The default export directory (:data:`~scribe_data.utils.DEFAULT_WIKTIONARY_DUMP_DIR`). 3. The default export directory under every ancestor of the current working directory. 4. The current working directory itself. @@ -141,8 +141,8 @@ def _wiktionary_dump_search_dirs() -> list[Path]: A deduplicated list of existing directories to search. """ candidates = [ - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - *(parent / DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR for parent in Path.cwd().parents), + DEFAULT_WIKTIONARY_DUMP_DIR, + *(parent / DEFAULT_WIKTIONARY_DUMP_DIR for parent in Path.cwd().parents), Path.cwd(), ] resolved_paths = [path.expanduser().resolve() for path in candidates] diff --git a/src/scribe_data/cli/interactive/run.py b/src/scribe_data/cli/interactive/run.py index 9d37fb0db..10036c21d 100644 --- a/src/scribe_data/cli/interactive/run.py +++ b/src/scribe_data/cli/interactive/run.py @@ -25,11 +25,7 @@ prompt_for_languages, resolve_wiktionary_dump_path, ) -from scribe_data.utils import ( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, -) +from scribe_data.utils import DEFAULT_WIKTIONARY_DUMP_DIR from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump # MARK: Configure @@ -162,20 +158,10 @@ def run_interactive_mode(operation: str | None = None) -> None: configure_settings() elif choice == "run_all": - if wikidata_dump_path := prompt( - f"Enter Wikidata lexeme dump path (default: {str(DEFAULT_WIKIDATA_DUMP_EXPORT_DIR)}): " - ): - wikidata_dump_path = Path(wikidata_dump_path) - - else: - wikidata_dump_path = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR - parse_wd_lexeme_dump( languages=interactive_mode_config.selected_languages, data_types=interactive_mode_config.selected_data_types, wikidata_dump_type=["form"], - output_dir=interactive_mode_config.output_dir, - wikidata_dump_path=wikidata_dump_path, overwrite_all=interactive_mode_config.overwrite, interactive_mode=True, ) @@ -199,12 +185,6 @@ def run_interactive_mode(operation: str | None = None) -> None: ) interactive_mode_config.input_dir = Path(user_input_dir) - user_output_dir = prompt( - f"Enter output directory (default: {interactive_mode_config.output_dir_sqlite}): ", - default=str(interactive_mode_config.output_dir_sqlite), - ) - interactive_mode_config.output_dir_sqlite = Path(user_output_dir) - identifier_case = prompt( "Enter identifier case (default: camel): ", default="camel", @@ -246,30 +226,19 @@ def run_interactive_mode(operation: str | None = None) -> None: f"[bold red]Error: {wiktionary_dump_language} is not a valid language.[/bold red]" ) - dump_location = prompt( - "Enter Wiktionary dump directory or file path " - f"(default: {DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR}): ", - default=str(DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR), - ) wiktionary_dump_path = resolve_wiktionary_dump_path( wiktionary_dump_language, - dump_location, + DEFAULT_WIKTIONARY_DUMP_DIR, ) if not wiktionary_dump_path: rprint( f"[bold red]No {wiktionary_dump_language} Wiktionary dump found at " - f"{dump_location}.[/bold red]" + f"{DEFAULT_WIKTIONARY_DUMP_DIR}.[/bold red]" ) break prompt_for_languages() - translations_output_dir = prompt( - "Enter output directory " - f"(default: {DEFAULT_WIKTIONARY_JSON_EXPORT_DIR}): ", - default=str(DEFAULT_WIKTIONARY_JSON_EXPORT_DIR), - ) - overwrite_str = prompt( "Overwrite existing files? (default: False): ", default="False", @@ -279,7 +248,6 @@ def run_interactive_mode(operation: str | None = None) -> None: parse_wiktionary_translations( target_languages=interactive_mode_config.selected_languages, wiktionary_dump_path=Path(wiktionary_dump_path), - output_dir=Path(translations_output_dir), overwrite=overwrite_bool, ) diff --git a/src/scribe_data/cli/total/wrapper.py b/src/scribe_data/cli/total/wrapper.py index 13f0dd34a..89bfddbe1 100644 --- a/src/scribe_data/cli/total/wrapper.py +++ b/src/scribe_data/cli/total/wrapper.py @@ -7,7 +7,6 @@ from scribe_data.cli.total.print_values import print_total_lexemes from scribe_data.cli.total.query import query_total_lexemes -from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump # MARK: Wrapper @@ -48,7 +47,6 @@ def total_wrapper( languages=languages or ["all"], data_types=data_types or ["all"], wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, ) return @@ -58,7 +56,6 @@ def total_wrapper( languages=languages or ["all"], data_types=data_types or ["all"], wikidata_dump_type=["total"], - wikidata_dump_path=wikidata_dump, ) return diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 2c3874cc1..faa8d389c 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -20,7 +20,7 @@ from tqdm.auto import tqdm from scribe_data.unicode.unicode_utils import get_emoji_codes_to_ignore -from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, get_language_iso +from scribe_data.utils import DEFAULT_JSON_DIR, get_language_iso emoji_codes_to_ignore = get_emoji_codes_to_ignore() @@ -149,7 +149,7 @@ def gen_emoji_lexicon( ) # Check nouns files for plurals and update their data with the emojis for their singular forms. - language_nouns_path = DEFAULT_JSON_EXPORT_DIR / f"{language}" / "nouns.json" + language_nouns_path = DEFAULT_JSON_DIR / f"{language}" / "nouns.json" if not language_nouns_path.is_file(): print( "\nNote: Getting a language's nouns before emoji keywords allows for plurals to be linked to the emojis for their singulars.\n" diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 61a12a856..ae687f20d 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -21,16 +21,15 @@ PROJECT_ROOT = "Scribe-Data" -DEFAULT_JSON_EXPORT_DIR = Path("scribe_data_json_export") -DEFAULT_FILTERED_JSON_EXPORT_DIR = Path("scribe_data_filtered_json_export") -DEFAULT_CSV_EXPORT_DIR = Path("scribe_data_csv_export") -DEFAULT_TSV_EXPORT_DIR = Path("scribe_data_tsv_export") -DEFAULT_SQLITE_EXPORT_DIR = Path("scribe_data_sqlite_export") - -DEFAULT_WIKIDATA_DUMP_EXPORT_DIR = Path("scribe_data_wikidata_dumps_export") - -DEFAULT_WIKTIONARY_JSON_EXPORT_DIR = Path("scribe_data_wiktionary_json_export") -DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR = Path("scribe_data_wiktionary_dumps_export") +DEFAULT_JSON_DIR = Path("scribe_data_json") +DEFAULT_FILTERED_JSON_DIR = Path("scribe_data_filtered_json") +DEFAULT_CSV_DIR = Path("scribe_data_csv") +DEFAULT_TSV_DIR = Path("scribe_data_tsv") +DEFAULT_SQLITE_DIR = Path("scribe_data_sqlite") + +DEFAULT_WIKIDATA_DUMP_DIR = Path("scribe_data_wikidata_dumps") +DEFAULT_WIKTIONARY_DUMP_DIR = Path("scribe_data_wiktionary_dumps") +DEFAULT_WIKTIONARY_JSON_DIR = Path("scribe_data_wiktionary_json") DEFAULT_CONTRACTS_EXPORT_DIR = Path("scribe_data_contracts") DEFAULT_DATA_CONTRACTS_DIR = Path(__file__).parent / "resources" / "data_contracts" diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index 521fe3839..d649e8069 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -16,7 +16,8 @@ from scribe_data.check.check_query_forms import return_correct_form_label from scribe_data.utils import ( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + DEFAULT_JSON_DIR, + DEFAULT_WIKIDATA_DUMP_DIR, check_index_exists, check_qid_is_language, data_type_metadata, @@ -608,7 +609,6 @@ def parse_dump( parse_type: list[str] = [""], data_types: list[str] | None = None, file_path: Path = Path("latest-lexemes.json.bz2"), - output_dir: Path | None = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, overwrite_all: bool = False, ) -> None: """ @@ -631,9 +631,6 @@ def parse_dump( file_path : str, default="latest-lexemes.json.bz2" Path to the lexeme dump file. - output_dir : str, optional - Directory to save output files. If None, uses DEFAULT_WIKIDATA_DUMP_EXPORT_DIR. - overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting. @@ -647,8 +644,7 @@ def parse_dump( will be skipped. """ # Prepare environment - Use default if output_dir is None. - output_dir = output_dir or DEFAULT_WIKIDATA_DUMP_EXPORT_DIR - Path(output_dir).mkdir(parents=True, exist_ok=True) + Path(DEFAULT_JSON_DIR).mkdir(parents=True, exist_ok=True) # Convert single strings to lists. parse_type = parse_type or [] @@ -678,11 +674,16 @@ def parse_dump( # Create appropriate path based on whether it's a sub-language. if main_lang: index_path = ( - Path(output_dir) / main_lang / lang / f"{data_type}.json" + DEFAULT_WIKIDATA_DUMP_DIR + / main_lang + / lang + / f"{data_type}.json" ) else: - index_path = Path(output_dir) / lang / f"{data_type}.json" + index_path = ( + DEFAULT_WIKIDATA_DUMP_DIR / lang / f"{data_type}.json" + ) if not check_index_exists(index_path, overwrite_all): needs_processing = True @@ -719,7 +720,7 @@ def parse_dump( if "form" in parse_type: # For each data_type, we create a separate file, e.g. nouns.json. for dt in data_types: - index_path = Path(output_dir) / f"{dt}.json" + index_path = DEFAULT_WIKIDATA_DUMP_DIR / f"{dt}.json" iso_codes = set() for word_data in processor.forms_index.values(): iso_codes.update(word_data.keys()) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index 0cf0068cf..fc66dfb48 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -15,7 +15,7 @@ from tqdm.auto import tqdm from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, + DEFAULT_JSON_DIR, WIKIDATA_QUERIES_ALL_DATA_DIR, format_sublanguage_name, language_metadata, @@ -167,7 +167,7 @@ def query_data( if str(output_dir).startswith("./") else output_dir ) - export_dir = (updated_path or DEFAULT_JSON_EXPORT_DIR) / lang.replace(" ", "_") + export_dir = (updated_path or DEFAULT_JSON_DIR) / lang.replace(" ", "_") export_dir.mkdir(parents=True, exist_ok=True) file_name = f"{target_type}.json" @@ -281,7 +281,7 @@ def query_data( # Call the formatting script. execute_formatting_script( - output_dir=output_dir or DEFAULT_JSON_EXPORT_DIR, + output_dir=output_dir or DEFAULT_JSON_DIR, language=lang, data_type=target_type, ) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 7195bf160..e31b04c02 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -11,11 +11,7 @@ from scribe_data.cli.download.wikidata_lexeme_dump import ( wd_lexeme_dump_download_wrapper, ) -from scribe_data.utils import ( - DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - data_type_metadata, - language_metadata, -) +from scribe_data.utils import data_type_metadata, language_metadata from scribe_data.wikidata.parse_dump import parse_dump sparql = SPARQLWrapper("https://query.wikidata.org/sparql") @@ -27,8 +23,6 @@ def parse_wd_lexeme_dump( languages: str | list[str] | None, data_types: list[str] | None = None, wikidata_dump_type: str | list[str] | None = None, - output_dir: Path | None = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, - wikidata_dump_path: Path | None = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, overwrite_all: bool = False, interactive_mode: bool = False, ) -> None: @@ -46,12 +40,6 @@ def parse_wd_lexeme_dump( wikidata_dump_type : List[str] The type(s) of Wikidata lexeme dump to parse (e.g. ["total", "form"]). - output_dir : str, optional - The directory to save the parsed JSON data. If None, uses default directory. - - wikidata_dump_path : Path - The local Wikidata lexeme dump directory that should be used to get data. - overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting. @@ -117,7 +105,6 @@ def parse_wd_lexeme_dump( parse_type=normalized_dump_type, data_types=data_types, file_path=file_path, - output_dir=output_dir, overwrite_all=overwrite_all, ) diff --git a/src/scribe_data/wiktionary/parse_translations.py b/src/scribe_data/wiktionary/parse_translations.py index cdcfce311..6a122b475 100644 --- a/src/scribe_data/wiktionary/parse_translations.py +++ b/src/scribe_data/wiktionary/parse_translations.py @@ -18,8 +18,8 @@ from tqdm import tqdm from scribe_data.utils import ( - DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, + DEFAULT_WIKTIONARY_DUMP_DIR, + DEFAULT_WIKTIONARY_JSON_DIR, check_index_exists, get_language_from_iso, language_metadata, @@ -1164,7 +1164,6 @@ def _filtered_iterator(): def parse_wiktionary_translations( target_languages: str | list[str] | None = None, wiktionary_dump_path: str | Path | None = None, - output_dir: Path | None = DEFAULT_WIKTIONARY_JSON_EXPORT_DIR, overwrite: bool = False, ) -> None: """ @@ -1179,14 +1178,10 @@ def parse_wiktionary_translations( wiktionary_dump_path : str or Path, optional Path to a ``*wiktionary-*-pages-articles.xml.bz2`` dump file. - output_dir : Path, optional, default=DEFAULT_WIKTIONARY_JSON_EXPORT_DIR - Directory where JSON files are saved. - overwrite : bool, default ``False`` Whether to overwrite existing output files. """ - output_dir = output_dir or DEFAULT_WIKTIONARY_JSON_EXPORT_DIR - Path(output_dir).mkdir(parents=True, exist_ok=True) + Path(DEFAULT_WIKTIONARY_JSON_DIR).mkdir(parents=True, exist_ok=True) target_isos: list[str] = [] if not target_languages or target_languages == "all" or target_languages == ["all"]: @@ -1216,14 +1211,14 @@ def parse_wiktionary_translations( dump_path, source_iso = _resolve_dump_path( wiktionary_dump_path=wiktionary_dump_path, - output_dir=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, + output_dir=DEFAULT_WIKTIONARY_DUMP_DIR, ) if not dump_path: return source_lang_name = get_language_from_iso(source_iso) out_subdir = _get_output_subdir(source_lang_name, language_metadata) - base_out_path = output_dir / out_subdir + base_out_path = DEFAULT_WIKTIONARY_JSON_DIR / out_subdir base_out_path.mkdir(parents=True, exist_ok=True) data_by_lang = parse_xml_dump( diff --git a/tests/cli/contracts/test_cli_contracts_check.py b/tests/cli/contracts/test_cli_contracts_check.py index a4655bcd8..5cc8cf175 100644 --- a/tests/cli/contracts/test_cli_contracts_check.py +++ b/tests/cli/contracts/test_cli_contracts_check.py @@ -104,7 +104,7 @@ def test_cli_contracts_check_default_dir( mock_print.assert_called_once_with({}) -@patch("scribe_data.cli.contracts.check.Path") +@patch("pathlib.Path") def test_cli_contracts_check_nonexistent_dir(mock_path: MagicMock) -> None: """ Test check_contracts with a nonexistent directory. @@ -112,7 +112,7 @@ def test_cli_contracts_check_nonexistent_dir(mock_path: MagicMock) -> None: mock_path.return_value.exists.return_value = False with patch("builtins.print") as mock_print: - check_contract_data_print_missing(contracts_dir=DEFAULT_DATA_CONTRACTS_DIR) + check_contract_data_print_missing(contracts_dir="nonexistent_dir") mock_print.assert_called_once() assert "Error: Directory" in mock_print.call_args[0][0] @@ -135,14 +135,20 @@ def test_cli_contracts_check_data_completeness_json_error( mock_filter_metadata.return_value = mock_contract_metadata # Execute with patched open to cause JSON error. - with patch("pathlib.Path.exists") as mock_exists: - mock_exists.return_value = True # make paths exist - with patch("builtins.open", side_effect=json.JSONDecodeError("Error", "", 0)): - with patch("builtins.print") as mock_print: - check_contract_data_completeness(mock_export_dir) - - mock_print.assert_called() - assert "Error reading" in mock_print.call_args[0][0] + with patch("pathlib.Path.exists") as mock_exists_1: + mock_exists_1.return_value = True + with patch("pathlib.Path.exists") as mock_exists_2: + mock_exists_2.return_value = True # make paths exist + with patch( + "builtins.open", side_effect=json.JSONDecodeError("Error", "", 0) + ): + with patch("builtins.print") as mock_print: + check_contract_data_completeness( + contracts_dir=DEFAULT_DATA_CONTRACTS_DIR + ) + + mock_print.assert_called() + assert "Error reading" in mock_print.call_args[0][0] def test_cli_contracts_print_missing_forms_none() -> None: diff --git a/tests/cli/contracts/test_cli_contracts_export.py b/tests/cli/contracts/test_cli_contracts_export.py index 95e223caf..fa5af57f5 100644 --- a/tests/cli/contracts/test_cli_contracts_export.py +++ b/tests/cli/contracts/test_cli_contracts_export.py @@ -8,6 +8,7 @@ from unittest.mock import patch import pytest +from conftest import cleanup_default_directories from scribe_data.cli.contracts.export import export_contracts from scribe_data.utils import DEFAULT_CONTRACTS_EXPORT_DIR @@ -25,11 +26,9 @@ def contracts_source(tmp_path: Path) -> Path: return source -def test_cli_contracts_export_fresh_export( - tmp_path: Path, contracts_source: Path -) -> None: +def test_cli_contracts_export_new_dir(tmp_path: Path, contracts_source: Path) -> None: """ - Test fresh export when no existing contracts folder. + Test export when no existing contracts folder. """ with patch( "scribe_data.cli.contracts.export.Path.__truediv__", @@ -48,6 +47,7 @@ def test_cli_contracts_export_success_message( """ Test success message after fresh export. """ + cleanup_default_directories() with patch( "scribe_data.cli.contracts.export.Path.__truediv__", return_value=contracts_source, @@ -121,6 +121,7 @@ def test_cli_contracts_export_files_content( """ Test that exported files have correct content. """ + cleanup_default_directories() with patch( "scribe_data.cli.contracts.export.Path.__truediv__", return_value=contracts_source, diff --git a/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py b/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py index acce141c6..831b9e01a 100644 --- a/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py +++ b/tests/cli/convert/test_cli_convert_to_csv_or_tsv.py @@ -8,7 +8,7 @@ import pytest from scribe_data.cli.convert.to_csv_or_tsv import convert_to_csv_or_tsv -from scribe_data.utils import DEFAULT_CSV_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR +from scribe_data.utils import DEFAULT_CSV_DIR, DEFAULT_TSV_DIR # MARK: CSV or TSV @@ -50,7 +50,7 @@ def test_cli_convert_to_csv_or_tsv_standard_dict_to_csv(self) -> None: overwrite=True, ) - output_file = DEFAULT_CSV_EXPORT_DIR / "English" / "prepositions.csv" + output_file = DEFAULT_CSV_DIR / "English" / "prepositions.csv" actual_content = output_file.read_text(encoding="utf-8") assert actual_content == expected_csv_output @@ -69,7 +69,7 @@ def test_cli_convert_to_csv_or_tsv_standard_dict_to_tsv(self) -> None: overwrite=True, ) - output_file = DEFAULT_TSV_EXPORT_DIR / "English" / "prepositions.tsv" + output_file = DEFAULT_TSV_DIR / "English" / "prepositions.tsv" actual_content = output_file.read_text(encoding="utf-8") assert actual_content == expected_tsv_output @@ -90,7 +90,7 @@ def test_cli_convert_to_csv_or_tsv_nested_dict_to_csv(self) -> None: overwrite=True, ) - output_file = DEFAULT_CSV_EXPORT_DIR / "English" / "nouns.csv" + output_file = DEFAULT_CSV_DIR / "English" / "nouns.csv" actual_content = output_file.read_text(encoding="utf-8") assert actual_content == expected_csv_output @@ -111,7 +111,7 @@ def test_cli_convert_to_csv_or_tsv_nested_dict_to_tsv(self) -> None: overwrite=True, ) - output_file = DEFAULT_TSV_EXPORT_DIR / "English" / "nouns.tsv" + output_file = DEFAULT_TSV_DIR / "English" / "nouns.tsv" actual_content = output_file.read_text(encoding="utf-8") assert actual_content == expected_tsv_output @@ -130,7 +130,7 @@ def test_cli_convert_to_csv_or_tsv_list_of_dicts_to_csv(self) -> None: overwrite=True, ) - output_file = DEFAULT_CSV_EXPORT_DIR / "English" / "emoji-keywords.csv" + output_file = DEFAULT_CSV_DIR / "English" / "emoji-keywords.csv" actual_content = output_file.read_text(encoding="utf-8") assert actual_content == expected_csv_output @@ -151,6 +151,6 @@ def test_cli_convert_to_csv_or_tsv_list_of_dicts_to_tsv(self) -> None: overwrite=True, ) - output_file = DEFAULT_TSV_EXPORT_DIR / "English" / "emoji-keywords.tsv" + output_file = DEFAULT_TSV_DIR / "English" / "emoji-keywords.tsv" actual_content = output_file.read_text(encoding="utf-8") assert actual_content == expected_tsv_output diff --git a/tests/cli/convert/test_cli_convert_to_json.py b/tests/cli/convert/test_cli_convert_to_json.py index 0c3b1449f..ec0d67ea5 100644 --- a/tests/cli/convert/test_cli_convert_to_json.py +++ b/tests/cli/convert/test_cli_convert_to_json.py @@ -12,7 +12,7 @@ import pytest from scribe_data.cli.convert.to_json import convert_to_json -from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +from scribe_data.utils import DEFAULT_JSON_DIR # MARK: JSON @@ -22,7 +22,7 @@ class TestCLIConvertToJSON(unittest.TestCase): def _setup_fixtures(self, tmp_path): self.tmp_path = tmp_path - @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("pathlib.Path", autospec=True) def test_cli_convert_to_json_empty_language(self, mock_path: MagicMock) -> None: csv_data = "key,value\na,1\nb,2" mock_file = StringIO(csv_data) @@ -43,7 +43,7 @@ def test_cli_convert_to_json_empty_language(self, mock_path: MagicMock) -> None: ) self.assertIn("Language '' is not recognized.", str(context.exception)) - @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("pathlib.Path", autospec=True) def test_cli_convert_to_json_supported_file_extension_csv( self, mock_path_class: MagicMock ) -> None: @@ -62,7 +62,7 @@ def test_cli_convert_to_json_supported_file_extension_csv( overwrite=True, ) - @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("pathlib.Path", autospec=True) def test_cli_convert_to_json_supported_file_extension_tsv( self, mock_path_class: MagicMock ) -> None: @@ -115,7 +115,7 @@ def test_cli_convert_to_json_standard_csv(self) -> None: overwrite=True, ) - output_file = DEFAULT_JSON_EXPORT_DIR / "English" / "nouns.json" + output_file = DEFAULT_JSON_DIR / "English" / "nouns.json" with open(output_file, "r", encoding="utf-8") as f: actual_content = json.load(f) @@ -140,7 +140,7 @@ def test_cli_convert_to_json_with_multiple_keys(self) -> None: overwrite=True, ) - output_file = DEFAULT_JSON_EXPORT_DIR / "English" / "nouns.json" + output_file = DEFAULT_JSON_DIR / "English" / "nouns.json" with open(output_file, "r", encoding="utf-8") as f: actual_content = json.load(f) @@ -164,7 +164,7 @@ def test_cli_convert_to_json_with_complex_structure(self) -> None: overwrite=True, ) - output_file = DEFAULT_JSON_EXPORT_DIR / "English" / "nouns.json" + output_file = DEFAULT_JSON_DIR / "English" / "nouns.json" with open(output_file, "r", encoding="utf-8") as f: actual_content = json.load(f) diff --git a/tests/cli/convert/test_cli_convert_to_sqlite.py b/tests/cli/convert/test_cli_convert_to_sqlite.py index 03f5c0612..4a68e210a 100644 --- a/tests/cli/convert/test_cli_convert_to_sqlite.py +++ b/tests/cli/convert/test_cli_convert_to_sqlite.py @@ -18,7 +18,7 @@ translations_to_sqlite, wiktionary_translations_to_sqlite, ) -from scribe_data.utils import DEFAULT_SQLITE_EXPORT_DIR +from scribe_data.utils import DEFAULT_SQLITE_DIR @pytest.fixture @@ -112,7 +112,7 @@ def translations_setup(tmp_path: Path) -> dict[str, Any]: """ lang_data_type_dict = {"english": ["translations"]} current_languages = ["english", "german", "french"] - expected_db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + expected_db_path = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" return { "lang_data_type_dict": lang_data_type_dict, @@ -139,7 +139,7 @@ def test_cli_convert_translations_to_sqlite( ) # Verify database creation. - db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" assert db_path.exists() # Check database contents. @@ -324,11 +324,11 @@ def test_cli_convert_convert_to_sqlite_translations_and_nouns(tmp_path: Path) -> ) # Assert TranslationData.sqlite exists. - translation_db = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + translation_db = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" assert translation_db.exists() # Assert ENLanguageData.sqlite (for other tables) exists. - noun_db = DEFAULT_SQLITE_EXPORT_DIR / "ENLanguageData.sqlite" + noun_db = DEFAULT_SQLITE_DIR / "ENLanguageData.sqlite" assert noun_db.exists() # Check nouns table created and has data. @@ -434,7 +434,7 @@ def test_cli_convert_wiktionary_translations_to_sqlite_basic(tmp_path): ) # Verify database was created. - db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" assert db_path.exists() conn = sqlite3.connect(db_path) @@ -493,7 +493,7 @@ def test_cli_convert_wiktionary_translations_to_sqlite_camel_case(tmp_path): overwrite=True, ) - db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" conn = sqlite3.connect(db_path) cursor = conn.cursor() @@ -529,7 +529,7 @@ def test_wiktionary_translations_to_sqlite_no_translation_files(tmp_path): wiktionary_translations_to_sqlite(language="english", input_file=str(input_dir)) # No TranslationData.sqlite should be created. - db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" assert not db_path.exists() @@ -561,7 +561,7 @@ def test_wiktionary_translations_to_sqlite_multiple_files(tmp_path): overwrite=True, ) - db_path = DEFAULT_SQLITE_EXPORT_DIR / "TranslationData.sqlite" + db_path = DEFAULT_SQLITE_DIR / "TranslationData.sqlite" conn = sqlite3.connect(db_path) cursor = conn.cursor() diff --git a/tests/cli/convert/test_cli_convert_wrapper.py b/tests/cli/convert/test_cli_convert_wrapper.py index 576f917e6..352ed2585 100644 --- a/tests/cli/convert/test_cli_convert_wrapper.py +++ b/tests/cli/convert/test_cli_convert_wrapper.py @@ -19,8 +19,8 @@ class TestCLIConvertWrapper(unittest.TestCase): def _setup_fixtures(self, tmp_path): self.tmp_path = tmp_path - @patch("scribe_data.cli.convert.Path", autospec=True) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + @patch("pathlib.Path", autospec=True) + @patch("scribe_data.cli.convert.to_sqlite.convert_to_sqlite", autospec=True) @patch("shutil.copy") def test_cli_convert_to_sqlite( self, @@ -48,8 +48,8 @@ def test_cli_convert_to_sqlite( overwrite=True, ) - @patch("scribe_data.cli.convert.Path", autospec=True) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + @patch("pathlib.Path", autospec=True) + @patch("scribe_data.cli.convert.to_sqlite.convert_to_sqlite", autospec=True) def test_cli_convert_to_sqlite_no_output_dir( self, mock_data_to_sqlite: MagicMock, mock_path: MagicMock ) -> None: @@ -80,7 +80,7 @@ def test_cli_convert_to_sqlite_no_output_dir( overwrite=True, ) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + @patch("scribe_data.cli.convert.to_sqlite.convert_to_sqlite", autospec=True) def test_cli_convert_wrapper_german_wiktionary_translations_sqlite( self, mock_data_to_sqlite: MagicMock ) -> None: @@ -103,10 +103,10 @@ def test_cli_convert_wrapper_german_wiktionary_translations_sqlite( ) @patch( - "scribe_data.cli.convert.DEFAULT_WIKTIONARY_JSON_EXPORT_DIR", + "scribe_data.utils.DEFAULT_WIKTIONARY_JSON_DIR", new=Path("/mock_wiktionary_dir"), ) - @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + @patch("scribe_data.cli.convert.to_sqlite.convert_to_sqlite", autospec=True) def test_cli_convert_wrapper_wiktionary_no_input_path_uses_wiktionary_default( self, mock_data_to_sqlite: MagicMock ) -> None: diff --git a/tests/cli/download/test_cli_download_wikidata_lexeme_dump.py b/tests/cli/download/test_cli_download_wikidata_lexeme_dump.py index 8179270c1..0cea2ae05 100644 --- a/tests/cli/download/test_cli_download_wikidata_lexeme_dump.py +++ b/tests/cli/download/test_cli_download_wikidata_lexeme_dump.py @@ -90,12 +90,10 @@ def test_cli_download_wd_lexeme_dump_by_date( ) @patch("requests.get") - @patch( - "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=False - ) - @patch("scribe_data.cli.download.open", new_callable=mock_open) - @patch("scribe_data.cli.download.tqdm") - @patch("scribe_data.cli.download.os.makedirs") + @patch("scribe_data.utils.check_lexeme_dump_prompt_download", return_value=False) + @patch("scribe_data.cli.download.wikidata_lexeme_dump.open", new_callable=mock_open) + @patch("tqdm.tqdm") + @patch("os.makedirs") @patch("questionary.confirm") def test_cli_download_wd_lexeme_dump_wrapper_latest( self, @@ -116,14 +114,12 @@ def test_cli_download_wd_lexeme_dump_wrapper_latest( mock_get.return_value.headers = {"content-length": "100"} mock_get.return_value.iter_content = lambda chunk_size: [b"data"] * 10 - # Mock DEFAULT_WIKIDATA_DUMP_EXPORT_DIR. + # Mock DEFAULT_WIKIDATA_DUMP_DIR. with patch( - "scribe_data.utils.DEFAULT_WIKIDATA_DUMP_EXPORT_DIR", + "scribe_data.utils.DEFAULT_WIKIDATA_DUMP_DIR", new=Path("test_export_dir"), ): - download_path = wd_lexeme_dump_download_wrapper( - output_dir=Path("test_export_dir") - ) + download_path = wd_lexeme_dump_download_wrapper() self.assertIsNotNone(download_path, "Download path should not be None") self.assertIn("latest-lexemes.json.bz2", str(download_path)) mock_makedirs.assert_called_with(Path("test_export_dir"), exist_ok=True) @@ -269,7 +265,9 @@ def test_cli_download_wd_lexeme_dump_download_wrapper_default_flag(self) -> None """ Test wrapper function with default flag set to True. """ - with patch("scribe_data.cli.download.download_wd_lexeme_dump") as mock_download: + with patch( + "scribe_data.cli.download.wikidata_lexeme_dump.download_wd_lexeme_dump" + ) as mock_download: mock_download.return_value = None result = wd_lexeme_dump_download_wrapper(default=True) diff --git a/tests/cli/interactive/test_cli_interactive_config.py b/tests/cli/interactive/test_cli_interactive_config.py index 5997e0a81..a9953d18d 100644 --- a/tests/cli/interactive/test_cli_interactive_config.py +++ b/tests/cli/interactive/test_cli_interactive_config.py @@ -52,8 +52,10 @@ def test_cli_interactive_configure_settings_all_languages( ) mock_prompt.side_effect = lambda *args, **kwargs: next(responses) - with patch("scribe_data.cli.interactive.config", self.config): - with patch("scribe_data.cli.interactive.display_summary"): + with patch( + "scribe_data.cli.interactive.config.interactive_mode_config", self.config + ): + with patch("scribe_data.cli.interactive.execute.display_summary"): configure_settings() self.assertEqual(self.config.selected_languages, self.config.languages) @@ -81,8 +83,10 @@ def test_cli_interactive_configure_settings_specific_languages( ) mock_prompt.side_effect = lambda *args, **kwargs: next(responses) - with patch("scribe_data.cli.interactive.config", self.config): - with patch("scribe_data.cli.interactive.display_summary"): + with patch( + "scribe_data.cli.interactive.config.interactive_mode_config", self.config + ): + with patch("scribe_data.cli.interactive.execute.display_summary"): configure_settings() self.assertEqual(self.config.selected_languages, ["english", "spanish"]) diff --git a/tests/cli/interactive/test_cli_interactive_execute.py b/tests/cli/interactive/test_cli_interactive_execute.py index 63e7ae146..38fccee11 100644 --- a/tests/cli/interactive/test_cli_interactive_execute.py +++ b/tests/cli/interactive/test_cli_interactive_execute.py @@ -40,7 +40,9 @@ def test_cli_interactive_execute_request( mock_progress = MagicMock() mock_tqdm.return_value.__enter__.return_value = mock_progress - with patch("scribe_data.cli.interactive.config", self.config): + with patch( + "scribe_data.cli.interactive.config.interactive_mode_config", self.config + ): execute_request() mock_get_data.assert_called_once_with( @@ -61,6 +63,8 @@ def test_cli_interactive_display_summary(self, mock_print: MagicMock) -> None: self.config.selected_data_types = ["nouns"] self.config.output_type = "json" - with patch("scribe_data.cli.interactive.config", self.config): + with patch( + "scribe_data.cli.interactive.config.interactive_mode_config", self.config + ): display_summary() mock_print.assert_called() diff --git a/tests/cli/interactive/test_cli_interactive_prompt.py b/tests/cli/interactive/test_cli_interactive_prompt.py index bd1697f2e..13240979c 100644 --- a/tests/cli/interactive/test_cli_interactive_prompt.py +++ b/tests/cli/interactive/test_cli_interactive_prompt.py @@ -43,9 +43,11 @@ def test_cli_interactive_request_total_lexeme( "nouns", # first call for data types ] - with patch("scribe_data.cli.interactive.config", self.config): + with patch( + "scribe_data.cli.interactive.config.interactive_mode_config", self.config + ): with patch( - "scribe_data.cli.interactive.list_all_languages", + "scribe_data.utils.list_all_languages", return_value=["english", "french"], ): prompt_for_languages() diff --git a/tests/cli/test_cli_get.py b/tests/cli/test_cli_get.py index 47f5c542f..d543b5d97 100644 --- a/tests/cli/test_cli_get.py +++ b/tests/cli/test_cli_get.py @@ -14,10 +14,10 @@ from scribe_data.cli.get import get_data from scribe_data.utils import ( - DEFAULT_CSV_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, - DEFAULT_SQLITE_EXPORT_DIR, - DEFAULT_TSV_EXPORT_DIR, + DEFAULT_CSV_DIR, + DEFAULT_JSON_DIR, + DEFAULT_SQLITE_DIR, + DEFAULT_TSV_DIR, ) @@ -497,10 +497,10 @@ def test_cli_get_default_output_directory_selection( """ mock_check_index.return_value = False test_cases = [ - ("csv", DEFAULT_CSV_EXPORT_DIR), - ("json", DEFAULT_JSON_EXPORT_DIR), - ("sqlite", DEFAULT_SQLITE_EXPORT_DIR), - ("tsv", DEFAULT_TSV_EXPORT_DIR), + ("csv", DEFAULT_CSV_DIR), + ("json", DEFAULT_JSON_DIR), + ("sqlite", DEFAULT_SQLITE_DIR), + ("tsv", DEFAULT_TSV_DIR), ] for output_type, expected_dir in test_cases: diff --git a/tests/cli/total/test_cli_total_wrapper.py b/tests/cli/total/test_cli_total_wrapper.py index fa0595cbf..7b83292fb 100644 --- a/tests/cli/total/test_cli_total_wrapper.py +++ b/tests/cli/total/test_cli_total_wrapper.py @@ -14,7 +14,7 @@ from scribe_data.cli.total.print_values import get_datatype_list from scribe_data.cli.total.query import query_total_lexemes from scribe_data.cli.total.wrapper import total_wrapper -from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, WIKIDATA_QIDS_PIDS_FILE +from scribe_data.utils import WIKIDATA_QIDS_PIDS_FILE try: with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file: @@ -27,21 +27,21 @@ class TestCLITotalWrapper(unittest.TestCase): - @patch("scribe_data.cli.total.print_total_lexemes") + @patch("scribe_data.cli.total.wrapper.print_total_lexemes") def test_cli_total_wrapper_all_bool( self, mock_print_total_lexemes: MagicMock ) -> None: total_wrapper(all_bool=True) - mock_print_total_lexemes.assert_called_once_with() + mock_print_total_lexemes.assert_called_once() - @patch("scribe_data.cli.total.print_total_lexemes") + @patch("scribe_data.cli.total.wrapper.print_total_lexemes") def test_cli_total_wrapper_language_only( self, mock_print_total_lexemes: MagicMock ) -> None: total_wrapper(languages=["English"]) mock_print_total_lexemes.assert_called_once_with(language="English") - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_language_and_data_type( self, mock_query_total_lexemes_lexemes: MagicMock ) -> None: @@ -56,7 +56,7 @@ def test_cli_total_wrapper_invalid_input(self) -> None: # MARK: Using Dump - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + @patch("scribe_data.wikidata.wikidata_utils.parse_wd_lexeme_dump") def test_cli_total_wrapper_wikidata_dump_flag( self, mock_parse_dump: MagicMock ) -> None: @@ -65,13 +65,10 @@ def test_cli_total_wrapper_wikidata_dump_flag( """ total_wrapper(wikidata_dump=True) mock_parse_dump.assert_called_once_with( - languages=["all"], - data_types=["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + languages=["all"], data_types=["all"], wikidata_dump_type=["total"] ) - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + @patch("scribe_data.wikidata.wikidata_utils.parse_wd_lexeme_dump") def test_cli_total_wrapper_wikidata_dump_with_all( self, mock_parse_dump: MagicMock ) -> None: @@ -80,13 +77,10 @@ def test_cli_total_wrapper_wikidata_dump_with_all( """ total_wrapper(wikidata_dump=True, all_bool=True) mock_parse_dump.assert_called_once_with( - languages=["all"], - data_types=["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + languages=["all"], data_types=["all"], wikidata_dump_type=["total"] ) - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + @patch("scribe_data.wikidata.wikidata_utils.parse_wd_lexeme_dump") def test_cli_total_wrapper_wikidata_dump_with_language_and_type( self, mock_parse_dump: MagicMock ) -> None: @@ -99,16 +93,13 @@ def test_cli_total_wrapper_wikidata_dump_with_language_and_type( wikidata_dump=Path("/path/to/dump.json"), ) mock_parse_dump.assert_called_once_with( - languages=["English"], - data_types=["nouns"], - wikidata_dump_type=["total"], - wikidata_dump_path=Path("/path/to/dump.json"), + languages=["English"], data_types=["nouns"], wikidata_dump_type=["total"] ) # MARK: Using QID - @patch("scribe_data.cli.total.check_qid_is_language") - @patch("scribe_data.cli.total.print_total_lexemes") + @patch("scribe_data.utils.check_qid_is_language") + @patch("scribe_data.cli.total.print_values.print_total_lexemes") def test_cli_total_wrapper_with_qid( self, mock_print_total: MagicMock, mock_check_qid: MagicMock ) -> None: @@ -119,8 +110,8 @@ def test_cli_total_wrapper_with_qid( total_wrapper(languages=["Q9217"]) mock_print_total.assert_called_once_with(language="Q9217") - @patch("scribe_data.cli.total.check_qid_is_language") - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.utils.check_qid_is_language") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_with_qid_and_datatype( self, mock_query_total_lexemes: MagicMock, mock_check_qid: MagicMock ) -> None: @@ -133,7 +124,7 @@ def test_cli_total_wrapper_with_qid_and_datatype( language="Q9217", data_type="nouns" ) - @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + @patch("scribe_data.wikidata.wikidata_utils.parse_wd_lexeme_dump") def test_cli_total_wrapper_qid_with_wikidata_dump( self, mock_parse_dump: MagicMock ) -> None: @@ -142,13 +133,10 @@ def test_cli_total_wrapper_qid_with_wikidata_dump( """ total_wrapper(languages=["Q9217"], wikidata_dump=True, all_bool=True) mock_parse_dump.assert_called_once_with( - languages=["Q9217"], - data_types=["all"], - wikidata_dump_type=["total"], - wikidata_dump_path=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + languages=["Q9217"], data_types=["all"], wikidata_dump_type=["total"] ) - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_query_total_lexemes_with_qid( self, mock_query_total_lexemes: MagicMock ) -> None: @@ -162,7 +150,7 @@ def test_cli_total_wrapper_query_total_lexemes_with_qid( # MARK: Multiple Languages and Data Types - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_multiple_languages( self, mock_query_total_lexemes: MagicMock ) -> None: @@ -180,7 +168,7 @@ def test_cli_total_wrapper_multiple_languages( ] mock_query_total_lexemes.assert_has_calls(expected_calls) - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_multiple_data_types( self, mock_query_total_lexemes: MagicMock ) -> None: @@ -198,7 +186,7 @@ def test_cli_total_wrapper_multiple_data_types( ] mock_query_total_lexemes.assert_has_calls(expected_calls) - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_total_wrapper_multiple_languages_and_types( self, mock_query_total_lexemes: MagicMock ) -> None: @@ -220,7 +208,7 @@ def test_total_wrapper_multiple_languages_and_types( # MARK: Error Handling - @patch("scribe_data.cli.total.sparql.query") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") def test_cli_query_total_lexemes_http_error(self, mock_query: MagicMock) -> None: """ Test handling of HTTPError when querying totals. @@ -238,7 +226,7 @@ def test_cli_query_total_lexemes_http_error(self, mock_query: MagicMock) -> None self.assertIsNone(result) mock_print.assert_any_call("Query failed after retries.") - @patch("scribe_data.cli.total.sparql.query") + @patch("scribe_data.wikidata.wikidata_utils.sparql.query") def test_cli_query_total_lexemes_incomplete_read( self, mock_query: MagicMock ) -> None: @@ -260,8 +248,8 @@ def test_cli_query_total_lexemes_incomplete_read( # MARK: Sub-language Handling - @patch("scribe_data.cli.total.get_datatype_list") - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.print_values.get_datatype_list") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_print_total_lexemes_with_sublanguages( self, mock_query_total_lexemes: MagicMock, mock_get_datatypes: MagicMock ) -> None: @@ -290,8 +278,8 @@ def test_cli_total_wrapper_print_total_lexemes_with_sublanguages( # MARK: Data Type List Handling - @patch("scribe_data.cli.total.language_metadata") - @patch("scribe_data.cli.total.list_all_languages") + @patch("scribe_data.utils.language_metadata") + @patch("scribe_data.utils.list_all_languages") @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") def test_cli_get_datatype_list_with_sublanguages( self, @@ -335,10 +323,10 @@ def mock_path_handler(path: str) -> MagicMock: mock_dir.__truediv__.side_effect = mock_path_handler - result = get_datatype_list("norwegian") # note: lowercase + result = get_datatype_list("norwegian") self.assertEqual(sorted(result), ["nouns", "verbs"]) - @patch("scribe_data.cli.total.language_metadata") + @patch("scribe_data.utils.language_metadata") @patch("scribe_data.utils.WIKIDATA_QUERIES_ALL_DATA_DIR") def test_cli_get_datatype_list_empty_directory( self, mock_dir: MagicMock, mock_metadata: MagicMock @@ -355,7 +343,7 @@ def test_cli_get_datatype_list_empty_directory( with self.assertRaises(ValueError): get_datatype_list("English") - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_with_invalid_language( self, mock_query_total_lexemes: MagicMock ) -> None: @@ -369,7 +357,7 @@ def test_cli_total_wrapper_with_invalid_language( mock_query_total_lexemes.assert_called_once() - @patch("scribe_data.cli.total.query_total_lexemes") + @patch("scribe_data.cli.total.wrapper.query_total_lexemes") def test_cli_total_wrapper_with_invalid_data_type( self, mock_query_total_lexemes: MagicMock ) -> None: diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..4c81b2775 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +""" +Utility functions for pytest tests. +""" + +import shutil +from pathlib import Path + +import pytest + +from scribe_data.utils import ( + DEFAULT_CONTRACTS_EXPORT_DIR, + DEFAULT_CSV_DIR, + DEFAULT_FILTERED_JSON_DIR, + DEFAULT_JSON_DIR, + DEFAULT_SQLITE_DIR, + DEFAULT_TSV_DIR, + DEFAULT_WIKIDATA_DUMP_DIR, + DEFAULT_WIKTIONARY_DUMP_DIR, + DEFAULT_WIKTIONARY_JSON_DIR, +) + + +def cleanup_default_directories() -> None: + """ + Utility function to safely remove default directories during testing. + """ + project_root = Path(__file__).parent.parent + dirs_to_delete = [ + DEFAULT_CONTRACTS_EXPORT_DIR, + DEFAULT_CSV_DIR, + DEFAULT_FILTERED_JSON_DIR, + DEFAULT_JSON_DIR, + DEFAULT_SQLITE_DIR, + DEFAULT_TSV_DIR, + DEFAULT_WIKIDATA_DUMP_DIR, + DEFAULT_WIKTIONARY_DUMP_DIR, + DEFAULT_WIKTIONARY_JSON_DIR, + ] + + for dir_name in dirs_to_delete: + target_dir = project_root / dir_name + if target_dir.exists() and target_dir.is_dir(): + shutil.rmtree(target_dir) + + +@pytest.fixture(scope="module", autouse=True) +def auto_cleanup_default_directories() -> None: + """ + Automatically cleans up test data after each file finishes. + """ + yield + cleanup_default_directories() diff --git a/tests/wikidata/test_wikidata_dump.py b/tests/wikidata/test_wikidata_dump.py index f56be809c..84e1bd7c7 100644 --- a/tests/wikidata/test_wikidata_dump.py +++ b/tests/wikidata/test_wikidata_dump.py @@ -7,7 +7,7 @@ import pytest -from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_EXPORT_DIR +from scribe_data.utils import DEFAULT_WIKIDATA_DUMP_DIR from scribe_data.wikidata.parse_dump import LexemeProcessor, parse_dump from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump @@ -136,7 +136,7 @@ def test_wikidata_parse_wd_lexeme_dump( parse_type=["translations"], data_types=["nouns"], file_path=str(test_file_path), - output_dir=DEFAULT_WIKIDATA_DUMP_EXPORT_DIR, + output_dir=DEFAULT_WIKIDATA_DUMP_DIR, overwrite_all=False, ) diff --git a/tests/wiktionary/test_wiktionary_parse_translations.py b/tests/wiktionary/test_wiktionary_parse_translations.py index 27650f2db..0c0b7cee5 100644 --- a/tests/wiktionary/test_wiktionary_parse_translations.py +++ b/tests/wiktionary/test_wiktionary_parse_translations.py @@ -7,6 +7,7 @@ import mwparserfromhell +from scribe_data.utils import DEFAULT_JSON_DIR from scribe_data.wiktionary.parse_constants import get_wiktionary_config from scribe_data.wiktionary.parse_translations import ( _extract_source_lang_section, @@ -464,7 +465,6 @@ def test_parse_wiktionary_translations_mock(self): """ translations are written to the expected JSON file on disk. """ - import shutil import tempfile from pathlib import Path @@ -489,21 +489,17 @@ def test_parse_wiktionary_translations_mock(self): tmp.write(dummy_xml_content) tmp_path = tmp.name - output_dir = Path(tempfile.mkdtemp()) - try: parse_wiktionary_translations( target_languages=["de"], wiktionary_dump_path=tmp_path, - output_dir=output_dir, overwrite=True, ) - self.assertTrue(output_dir.exists()) - de_file = output_dir / "english" / "de_translations_from_en.json" + self.assertTrue(DEFAULT_JSON_DIR.exists()) + de_file = DEFAULT_JSON_DIR / "english" / "de_translations_from_en.json" self.assertTrue(de_file.exists()) finally: - shutil.rmtree(output_dir) Path(tmp_path).unlink()