Source code for tamr_toolbox.enrichment.api_client.google_translate

"""Tasks related to translating data with the google translation API"""
import html
import logging
import os
import time
from typing import Dict, List, Optional

from tamr_toolbox.enrichment.dictionary import TranslationDictionary

# Building our documentation requires access to all dependencies, including optional ones.
# This environment variable is set automatically when `invoke doc_src` is used.
BUILDING_DOCS = os.environ.get("TAMR_TOOLBOX_DOCS") == "1"
if BUILDING_DOCS:
    # Import relevant optional dependencies
    # Outside of documentation builds this import is skipped, so the name only appears in
    # string (forward-reference) annotations and is imported locally where it is needed.
    from google.cloud.translate_v2 import Client as GoogleTranslateClient

# Module-level logger shared by every function in this file
LOGGER = logging.getLogger(__name__)


def _check_valid_translation_language(
    client: "GoogleTranslateClient", language: str, *, target_language: Optional[str] = None
) -> None:
    """
    Checks that the provided language is an accepted google translation api language with an
    option to specify the target language to check the source to target language combination
    is supported

    The special value "auto" is always accepted and never triggers an error.

    Args:
        client: a google translate api client
        language: the language to check
        target_language: the target language to translate to, forwarded to the client's
            `get_languages` call. Leave as None to check `language` on its own

    Returns:
        None

    Raises:
        ValueError: if `language` is not "auto" and is not among the languages returned by
            the client
    """
    supported = client.get_languages(target_language=target_language)
    # Kept as a list (not a set) because it is rendered verbatim in the error message
    valid_source_languages = [entry["language"] for entry in supported]

    if language == "auto" or language in valid_source_languages:
        return

    if target_language is None:
        error_message = (
            f"Specified language {language} is not supported by the Google Translation API. "
            f"Valid languages are: {valid_source_languages}"
        )
    else:
        error_message = (
            f"Translation from {language} to {target_language} is not supported by the "
            "Google Translation API. "
            f"Valid source languages for {target_language} are: {valid_source_languages}"
        )
    LOGGER.error(error_message)
    raise ValueError(error_message)


def _check_valid_translation_languages(
    client: "GoogleTranslateClient", source_language: str, target_language: str
) -> None:
    """
    Validates a source/target language pair before it is sent to the google translation api

    The target language is checked first and may never be "auto". A source language of "auto"
    is accepted without a lookup; any other source language is checked in combination with
    the target language.

    Args:
        client: a google translate api client
        source_language: the language to translate from, or "auto"
        target_language: the language to translate to

    Returns:
        None

    Raises:
        ValueError: if the target language is "auto"
    """
    if target_language == "auto":
        auto_target_error = "'auto' is not a valid target language for translation"
        LOGGER.error(auto_target_error)
        raise ValueError(auto_target_error)

    # The target must be a supported language in its own right
    _check_valid_translation_language(client, target_language)

    if source_language != "auto":
        # Explicit source: confirm the source -> target combination is supported
        _check_valid_translation_language(client, source_language, target_language=target_language)
        return

    LOGGER.info(
        "Source language is set to 'auto', "
        "the Google Translation API will automatically detect the source language"
    )


def translation_client_from_json(json_credential_path: str) -> "GoogleTranslateClient":
    """
    Builds a Google translation client from the credentials in a Google credential json file

    Args:
        json_credential_path: path to the google credential json file

    Returns:
        A Google Translate Client
    """
    # Imported here rather than at module level because the dependency is optional
    from google.cloud.translate_v2 import Client as GoogleTranslateClient

    LOGGER.info("Connecting to Google Translation Client")
    return GoogleTranslateClient.from_service_account_json(json_credential_path)


def translate(
    phrases_to_translate: List[str],
    client: "GoogleTranslateClient",
    *,
    source_language: str = "auto",
    target_language: str = "en",
    translation_model: str = "nmt",
    num_of_tries: int = 4,
) -> Optional[Dict[str, TranslationDictionary]]:
    """
    Translate a list of text to a target language using google's translation api

    Args:
        phrases_to_translate: list of phrases to translate from the source language to the
            target language
        client: a google translate api client
        source_language: the language the text to translate is in, "auto" means the google
            api will try to detect the source language automatically
        target_language: the language to translate into
        translation_model: google api model to use, "nmt" or "pbmt". Choose "pbmt" if an
            "nmt" model doesn't exist for your source to target language pair
        num_of_tries: number of times to try to translate if the translation call fails

    Returns:
        A dictionary mapping each input phrase to its toolbox translation dictionary entry.
        None if the translation failed on every try

    Raises:
        ValueError: if the language check performed before translating rejects the
            source/target languages
    """
    _check_valid_translation_languages(
        client=client, target_language=target_language, source_language=source_language
    )

    # The client is called with source_language=None when the language should be detected
    if source_language == "auto":
        source_language = None

    for num_attempts in range(1, num_of_tries + 1):
        # Computed on every iteration so that a single-try call never sleeps and then gives up
        # without actually retrying
        last_attempt = num_attempts == num_of_tries
        if last_attempt and num_of_tries > 1:
            LOGGER.warning(
                "WARNING: Failed to translate current chunk of phrases %s times. "
                "Final try.",
                num_of_tries - 1,
            )

        try:
            response = client.translate(
                target_language=target_language,
                source_language=source_language,
                model=translation_model,
                values=phrases_to_translate,
            )

            returned_translation = {}
            for translation in response:
                entry_kwargs = {
                    "standardized_phrase": translation["input"],
                    # Translated text is HTML-unescaped before it is stored
                    "translated_phrase": html.unescape(translation["translatedText"]),
                }
                if source_language is None:
                    # Only read the detected language when detection was requested
                    entry_kwargs["detected_language"] = translation["detectedSourceLanguage"]
                returned_translation[translation["input"]] = TranslationDictionary(
                    **entry_kwargs
                )

            LOGGER.debug(returned_translation)
            return returned_translation

        # TODO: check which exception returns the User Rate Limit error
        #  to better handle the exceptions
        except Exception as excp:
            if "User Rate Limit Exceeded" in str(excp) and not last_attempt:
                LOGGER.warning(
                    "Google api_client API user rate limit exceeded, "
                    "waiting 10 seconds and retrying."
                )
                time.sleep(10)
            else:
                error_message = f"Could not translate current chunk of phrases. Error: {excp}"
                LOGGER.error(error_message)

    LOGGER.error("Ran out of number of tries. Skipping.")
    return None