"""Tasks related to efficiently translating data not present in existing translation
dictionaries"""
import logging
import math
import os
from typing import Dict, List, Union
from tamr_toolbox.enrichment.api_client import google_translate
from tamr_toolbox.enrichment.dictionary import TranslationDictionary, save, update
from tamr_toolbox.enrichment.enrichment_utils import _yield_chunk
# Building our documentation requires access to all dependencies, including optional ones
# This environment variable is set automatically when `invoke docs` is used
BUILDING_DOCS = os.environ.get("TAMR_TOOLBOX_DOCS") == "1"
if BUILDING_DOCS:
# Import relevant optional dependencies
from google.cloud.translate_v2 import Client as GoogleTranslateClient
LOGGER = logging.getLogger(__name__)
def _filter_numeric_and_null_phrases(phrase: Union[str, None]) -> str:
"""
    Transform None and numbers saved as text into empty strings
Args:
phrase: data to filter
    Returns:
        An empty string if the phrase is None or numeric, otherwise the
        unmodified phrase
    Raises:
        TypeError: if the provided phrase is not of type string
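    Example:
        A minimal doctest-style illustration of the behavior implemented below::

            >>> _filter_numeric_and_null_phrases(None)
            ''
            >>> _filter_numeric_and_null_phrases("42")
            ''
            >>> _filter_numeric_and_null_phrases("cheese")
            'cheese'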
"""
if phrase is None:
return ""
elif not isinstance(phrase, str):
        error_message = (
            f"{phrase} is not in text format. "
            "Only text can be translated, check the data type."
        )
LOGGER.error(error_message)
raise TypeError(error_message)
elif phrase.isnumeric():
return ""
else:
return phrase
def standardize_phrases(original_phrases: List[str]) -> List[str]:
"""
    Standardize phrases to translate so that phrases previously translated with
    different formatting are not re-translated
Args:
original_phrases: List of phrases to standardize
Returns:
List of standardized text
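    Example:
        A small doctest-style sketch; lowercasing and whitespace collapsing are
        applied, and numeric strings are blanked by the helper above::

            >>> standardize_phrases(["  Hello   World ", "123"])
            ['hello world', '']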
"""
standardized_phrases = [
" ".join(_filter_numeric_and_null_phrases(phrase).lower().split())
for phrase in original_phrases
]
return standardized_phrases
def get_phrases_to_translate(
original_phrases: List[str], translation_dictionary: Dict[str, TranslationDictionary]
) -> List[str]:
"""
    Find phrases that have not previously been translated and initialize their
    dictionary entries
Args:
original_phrases: list of phrases to translate
translation_dictionary: a translation dictionary
Returns:
List of standardized phrases not present as keys of the translation dictionary
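    Example:
        A hedged sketch of typical usage, assuming ``TranslationDictionary``
        entries default ``translated_phrase`` to ``None``::

            dictionary = {}
            to_translate = get_phrases_to_translate(["Cheese", "cheese "], dictionary)
            # both spellings standardize to "cheese", so a single dictionary
            # entry is created and ["cheese"] is returned for translation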
"""
count_already_translated = 0
count_needing_translation = 0
for original, standard in zip(original_phrases, standardize_phrases(original_phrases)):
        if standard in translation_dictionary:
translation_dictionary[standard].original_phrases.add(original)
count_already_translated += 1
else:
translation_dictionary[standard] = TranslationDictionary(
original_phrases={original}, standardized_phrase=standard
)
count_needing_translation += 1
LOGGER.info(
f"From the {len(original_phrases)} sent for translation, "
f"{count_already_translated} can be translated with the dictionary and "
f"{count_needing_translation} need to be translated"
)
to_translate = [
t.standardized_phrase
for t in translation_dictionary.values()
if t.translated_phrase is None
]
    LOGGER.debug(f"Phrases to translate: {to_translate}")
return to_translate
def from_list(
all_phrases: List[str],
client: "GoogleTranslateClient",
dictionary: Dict[str, TranslationDictionary],
*,
source_language: str = "auto",
target_language: str = "en",
chunk_size: int = 100,
translation_model: str = "nmt",
intermediate_save_every_n_chunks: Union[int, None] = None,
intermediate_save_to_disk: bool = False,
intermediate_folder: str = "/tmp",
) -> Dict[str, TranslationDictionary]:
"""
    Translate a list of phrases from a source language to a target language.
    Translations are accumulated in a temporary dictionary that is periodically
    merged into the main dictionary, which can optionally be saved to your local
    file system along the way
Args:
all_phrases: List of standardized phrases to translate.
client: a google translate api client
dictionary: a toolbox translation dictionary
        source_language: the language the text to translate is in; "auto" means the
            Google Translate client will try to detect the source language
            automatically
        target_language: the language to translate into
        chunk_size: number of phrases to translate per API call; if set too high you
            will hit API user rate limit errors
        translation_model: the Google Translate model to use, "nmt" or "pbmt".
            Choose "pbmt" if an "nmt" model doesn't exist for your source-to-target
            language pair
        intermediate_save_every_n_chunks: save the dictionary to disk after every n
            chunks of translated phrases
        intermediate_save_to_disk: whether to periodically save the dictionary to
            disk to avoid losing translation data if the code breaks
        intermediate_folder: path to the folder where the dictionary is periodically
            saved to avoid loss of translation data
Returns:
The updated translation dictionary
Raises:
ValueError: if the argument chunk_size is set to 0
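    Example:
        A hedged usage sketch; assumes the optional ``google-cloud-translate``
        dependency is installed and application-default credentials are
        configured::

            from google.cloud.translate_v2 import Client

            dictionary = {}
            dictionary = from_list(
                all_phrases=["bonjour", "merci"],
                client=Client(),
                dictionary=dictionary,
                source_language="fr",
                target_language="en",
            )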
"""
if chunk_size == 0:
error_message = "Translation chunk size cannot be of size 0"
LOGGER.error(error_message)
raise ValueError(error_message)
if intermediate_save_every_n_chunks == 0 or intermediate_save_every_n_chunks is None:
intermediate_save_every_n_chunks = math.inf
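    # math.inf disables intermediate saving: ((ix + 1) % math.inf) is never 0,
    # so the save condition in the loop below is never met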
unique_all_phrases = list(set(all_phrases))
nbr_of_unique_phrases = len(unique_all_phrases)
phrases_to_translate = get_phrases_to_translate(unique_all_phrases, dictionary)
number_phrases_to_translate = len(phrases_to_translate)
if number_phrases_to_translate == 0:
LOGGER.info("All phrases to translate are found in the local dictionary.")
else:
LOGGER.info(
f"Of the {nbr_of_unique_phrases} unique phrases to translate, "
f"{number_phrases_to_translate} were not found in the dictionary."
)
        # Google enforces translation rate limits; to avoid hitting them,
        # the phrases are sent for translation in chunks
number_of_chunks = math.ceil(number_phrases_to_translate / chunk_size)
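        # e.g. 250 phrases with a chunk_size of 100 are sent as
        # math.ceil(250 / 100) = 3 chunks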
tmp_dictionary = {}
for ix, chunk_of_phrases in enumerate(_yield_chunk(phrases_to_translate, chunk_size)):
LOGGER.debug(f"Translating chunk {ix + 1} out of {number_of_chunks}.")
translated_phrases = google_translate.translate(
phrases_to_translate=chunk_of_phrases,
client=client,
source_language=source_language,
target_language=target_language,
translation_model=translation_model,
)
if translated_phrases is not None:
tmp_dictionary.update(translated_phrases)
            if ((ix + 1) % intermediate_save_every_n_chunks) == 0:
LOGGER.info("Saving intermediate outputs")
update(main_dictionary=dictionary, tmp_dictionary=tmp_dictionary)
if intermediate_save_to_disk:
save(
translation_dictionary=dictionary,
dictionary_folder=intermediate_folder,
target_language=target_language,
source_language=source_language,
)
                # reset temporary results after saving them
tmp_dictionary = {}
        # merge any remaining translations into the main dictionary
update(main_dictionary=dictionary, tmp_dictionary=tmp_dictionary)
return dictionary