# Source code for tamr_toolbox.enrichment.translate

"""Tasks related to efficiently translating data not present in existing translation
dictionaries"""
import logging
import math
import os
from typing import Dict, List, Union

from tamr_toolbox.enrichment.api_client import google_translate
from tamr_toolbox.enrichment.dictionary import TranslationDictionary, save, update
from tamr_toolbox.enrichment.enrichment_utils import _yield_chunk

# Building our documentation requires access to all dependencies, including optional ones
# This environment variable is set automatically when `invoke docs` is used
BUILDING_DOCS = os.environ.get("TAMR_TOOLBOX_DOCS") == "1"
if BUILDING_DOCS:
    # Import relevant optional dependencies
    # The client type is only referenced as a string annotation in `from_list`,
    # which is why the import is skipped outside of documentation builds
    from google.cloud.translate_v2 import Client as GoogleTranslateClient


# Module-level logger, named after this module
LOGGER = logging.getLogger(__name__)


def _filter_numeric_and_null_phrases(phrase: Union[str, None]) -> str:
    """
    Transform None and numbers saved as text as empty strings

    Args:
        phrase: data to filter

    Returns:
        An empty string
        Raises an error if the input is neither None nor a string

    Raises:
        TypeError: is the provided phrase is not of type string
    """
    if phrase is None:
        return ""
    elif not isinstance(phrase, str):
        error_message = (
            f"{phrase} is not in text format. " f"Only text can be translated, check data type."
        )
        LOGGER.error(error_message)
        raise TypeError(error_message)
    elif phrase.isnumeric():
        return ""
    else:
        return phrase


def standardize_phrases(original_phrases: List[str]) -> List[str]:
    """
    Standardize phrases to translate to avoid re-translating previously translated phrases
    but with different formatting

    Each phrase is lower-cased, stripped of leading and trailing whitespace, and every run of
    internal whitespace is collapsed to a single space. None entries and numbers saved as text
    are standardized to an empty string.

    Args:
        original_phrases: List of phrases to standardize

    Returns:
        List of standardized text, in the same order as the input

    Raises:
        TypeError: if an entry is neither None nor a string
    """
    standardized_phrases = [
        # split() without arguments drops leading/trailing whitespace and splits on any
        # whitespace run, so re-joining with one space normalizes the spacing
        " ".join(_filter_numeric_and_null_phrases(phrase).lower().split())
        for phrase in original_phrases
    ]
    return standardized_phrases
def get_phrases_to_translate(
    original_phrases: List[str], translation_dictionary: Dict[str, TranslationDictionary]
) -> List[str]:
    """
    Find phrases not previously translated and initiate dictionary entry

    The translation dictionary is modified in place: an original phrase whose standardized
    form is already a key is added to that entry's original phrases, and a new entry without
    translation is created for every standardized form not yet present.

    Args:
        original_phrases: list of phrases to translate
        translation_dictionary: a translation dictionary

    Returns:
        List of the standardized phrases of all dictionary entries that have no translated
        phrase yet, including the entries created by this call
    """
    count_already_translated = 0
    count_needing_translation = 0
    for original, standard in zip(original_phrases, standardize_phrases(original_phrases)):
        if standard in translation_dictionary:
            # Known standardized form: only remember this additional original spelling
            translation_dictionary[standard].original_phrases.add(original)
            count_already_translated += 1
        else:
            # Unknown standardized form: create an entry that still lacks a translation
            translation_dictionary[standard] = TranslationDictionary(
                original_phrases={original}, standardized_phrase=standard
            )
            count_needing_translation += 1

    LOGGER.info(
        f"From the {len(original_phrases)} sent for translation, "
        f"{count_already_translated} can be translated with the dictionary and "
        f"{count_needing_translation} need to be translated"
    )

    # Every entry without a translation is returned, not only the ones created above
    to_translate = [
        t.standardized_phrase
        for t in translation_dictionary.values()
        if t.translated_phrase is None
    ]
    LOGGER.debug(f"{to_translate}")
    return to_translate
def from_list(
    all_phrases: List[str],
    client: "GoogleTranslateClient",
    dictionary: Dict[str, TranslationDictionary],
    *,
    source_language: str = "auto",
    target_language: str = "en",
    chunk_size: int = 100,
    translation_model: str = "nmt",
    intermediate_save_every_n_chunks: Union[int, None] = None,
    intermediate_save_to_disk: bool = False,
    intermediate_folder: str = "/tmp",
) -> Dict[str, TranslationDictionary]:
    """
    Translate a list of phrases from source language to target language.

    Phrases are sent for translation in chunks. Translations are accumulated in a temporary
    dictionary that is merged into the main dictionary every
    `intermediate_save_every_n_chunks` chunks (optionally also saving the main dictionary to
    disk) and once more after the last chunk.

    Args:
        all_phrases: List of phrases to translate. Duplicates are removed before translation
        client: a google translate api client
        dictionary: a toolbox translation dictionary, updated in place
        source_language: the language the text to translate is in, "auto" means the
            translation API will try to detect the source language automatically
        target_language: the language to translate into
        chunk_size: number of phrases to translate per API call, set too high and you
            will hit API user rate limit errors
        translation_model: translation model to use, "nmt" or "pbmt". Choose "pbmt" if an
            "nmt" model doesn't exist for your source to target language pair
        intermediate_save_every_n_chunks: merge the translations into the dictionary every
            n chunks of phrases translated. None or 0 disables intermediate saves
        intermediate_save_to_disk: decide whether to also save the dictionary to disk at
            each intermediate save to avoid loss of translation data if code breaks
        intermediate_folder: path to folder where dictionary will be saved periodically to
            avoid loss of translation data

    Returns:
        The updated translation dictionary

    Raises:
        ValueError: if the argument chunk_size is set to 0
    """
    if chunk_size == 0:
        error_message = "Translation chunk size cannot be of size 0"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    # An infinite interval means the intermediate save condition below is never met
    if intermediate_save_every_n_chunks == 0 or intermediate_save_every_n_chunks is None:
        intermediate_save_every_n_chunks = math.inf

    unique_all_phrases = list(set(all_phrases))
    nbr_of_unique_phrases = len(unique_all_phrases)

    phrases_to_translate = get_phrases_to_translate(unique_all_phrases, dictionary)
    number_phrases_to_translate = len(phrases_to_translate)

    if number_phrases_to_translate == 0:
        LOGGER.info("All phrases to translate are found in the local dictionary.")
    else:
        LOGGER.info(
            f"Of the {nbr_of_unique_phrases} unique phrases to translate, "
            f"{number_phrases_to_translate} were not found in the dictionary."
        )

        # Google has translation rate limits
        # to avoid hitting those the phrases are sent for translation in chunks
        number_of_chunks = math.ceil(number_phrases_to_translate / chunk_size)
        tmp_dictionary = {}
        for ix, chunk_of_phrases in enumerate(_yield_chunk(phrases_to_translate, chunk_size)):
            LOGGER.debug(f"Translating chunk {ix + 1} out of {number_of_chunks}.")
            translated_phrases = google_translate.translate(
                phrases_to_translate=chunk_of_phrases,
                client=client,
                source_language=source_language,
                target_language=target_language,
                translation_model=translation_model,
            )
            if translated_phrases is not None:
                tmp_dictionary.update(translated_phrases)

            # ix is zero-based: count completed chunks with ix + 1 so that a save happens
            # after every n-th chunk, and never when the interval is infinite
            # (0 % n == 0 would otherwise always trigger a save after the first chunk)
            if ((ix + 1) % intermediate_save_every_n_chunks) == 0:
                LOGGER.info("Saving intermediate outputs")
                update(main_dictionary=dictionary, tmp_dictionary=tmp_dictionary)
                if intermediate_save_to_disk:
                    save(
                        translation_dictionary=dictionary,
                        dictionary_folder=intermediate_folder,
                        target_language=target_language,
                        source_language=source_language,
                    )
                # resetting temporary results after saving it
                tmp_dictionary = {}

        # update dictionary with whatever was translated since the last intermediate save
        update(main_dictionary=dictionary, tmp_dictionary=tmp_dictionary)

    return dictionary