"""Tasks related to translating data with the google translation API"""
import html
import logging
import os
import time
from typing import Dict, List, Optional
from tamr_toolbox.enrichment.dictionary import TranslationDictionary
# Building our documentation requires access to all dependencies, including optional ones.
# This environment variable is set automatically when `invoke doc_src` is used.
BUILDING_DOCS = os.environ.get("TAMR_TOOLBOX_DOCS") == "1"
if BUILDING_DOCS:
    # Import the optional Google dependency at module level only for documentation builds.
    # Outside of doc builds the name is referenced through string annotations, and
    # `translation_client_from_json` imports it locally when it is actually needed.
    from google.cloud.translate_v2 import Client as GoogleTranslateClient

# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)
def _check_valid_translation_language(
    client: "GoogleTranslateClient", language: str, *, target_language: Optional[str] = None
) -> None:
    """
    Checks that the provided language is listed by the Google Translation API.

    The list of accepted codes is fetched with ``client.get_languages`` and the optional
    ``target_language`` is forwarded to that call. The value ``"auto"`` is always accepted,
    even though it is not part of the returned list.

    Args:
        client: a google translate api client
        language: the language code to check
        target_language: the target language to translate to; forwarded to
            ``client.get_languages`` and used to word the error message
            (NOTE(review): presumably this narrows the list to sources valid for that
            target -- confirm against the client documentation)

    Returns:
        None

    Raises:
        ValueError: if ``language`` is neither ``"auto"`` nor one of the returned codes
    """
    supported = client.get_languages(target_language=target_language)
    valid_source_languages = [entry["language"] for entry in supported]

    # Nothing to report when the code is known or automatic detection was requested.
    if language == "auto" or language in valid_source_languages:
        return

    if target_language is None:
        # The first fragment ends with ". " so the two sentences do not run together.
        error_message = (
            f"Specified language {language} is not supported by the Google Translation API. "
            f"Valid languages are: {valid_source_languages}"
        )
    else:
        error_message = (
            f"Translation from {language} to {target_language} is not supported by the "
            "Google Translation API. "
            f"Valid source languages for {target_language} are: {valid_source_languages}"
        )
    LOGGER.error(error_message)
    raise ValueError(error_message)
def _check_valid_translation_languages(
    client: "GoogleTranslateClient", source_language: str, target_language: str
) -> None:
    """
    Validates the target language and then the source language for a translation request.

    Args:
        client: a google translate api client
        source_language: the language to translate from, or "auto" to let the API detect it
        target_language: the language to translate to; "auto" is rejected

    Returns:
        None

    Raises:
        ValueError: if the target language is "auto", or if either language is rejected by
            `_check_valid_translation_language`
    """
    # The target can never be auto-detected, so reject it before any API lookup.
    if target_language == "auto":
        error_message = "'auto' is not a valid target language for translation"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    _check_valid_translation_language(client, target_language)

    # An explicit source language is checked together with the requested target.
    if source_language != "auto":
        _check_valid_translation_language(client, source_language, target_language=target_language)
        return

    LOGGER.info(
        "Source language is set to 'auto', "
        "the Google Translation API will automatically detect the source language"
    )
def translation_client_from_json(json_credential_path: str) -> "GoogleTranslateClient":
    """
    Builds a Google translation client from a Google service account credential file.

    Args:
        json_credential_path: path to the google credential json file

    Returns:
        A Google Translate Client
    """
    # Imported here rather than at module level so the optional dependency is only
    # required once a client is actually requested.
    from google.cloud.translate_v2 import Client as GoogleTranslateClient

    LOGGER.info("Connecting to Google Translation Client")
    return GoogleTranslateClient.from_service_account_json(json_credential_path)
def translate(
    phrases_to_translate: List[str],
    client: "GoogleTranslateClient",
    *,
    source_language: str = "auto",
    target_language: str = "en",
    translation_model: str = "nmt",
    num_of_tries: int = 4,
) -> Optional[Dict[str, TranslationDictionary]]:
    """
    Translate a list of phrases to a target language using Google's translation API.

    The language pair is validated first. The translation call is then attempted up to
    ``num_of_tries`` times; a "User Rate Limit Exceeded" failure waits 10 seconds before the
    next attempt, any other failure is logged and retried immediately.

    Args:
        phrases_to_translate: list of phrases to translate from the source language to the
            target language
        client: a Google translate API client, e.g. one created with
            `translation_client_from_json`
        source_language: the language the phrases are written in; "auto" asks the API to
            detect the source language automatically
        target_language: the language to translate into
        translation_model: translation model to use, "nmt" or "pbmt".
            Choose "pbmt" if an "nmt" model doesn't exist for your source to target language
            pair
        num_of_tries: number of times to try to translate if the translation call fails

    Returns:
        A dictionary mapping each input phrase to its toolbox translation dictionary
        (empty if there were no phrases to translate).
        None if the translation failed on every attempt

    Raises:
        ValueError: if the source or target language fails validation
    """
    _check_valid_translation_languages(
        client=client, target_language=target_language, source_language=source_language
    )

    # Nothing to translate: skip the API round-trip (and its retries) entirely.
    if not phrases_to_translate:
        return {}

    # The client is given None instead of "auto" when the source should be detected.
    api_source_language = None if source_language == "auto" else source_language

    for attempt in range(1, num_of_tries + 1):
        # Computed for every value of num_of_tries (including 1) so that the final attempt
        # never sleeps on a rate-limit error only to give up right afterwards.
        last_attempt = attempt == num_of_tries
        if last_attempt and num_of_tries > 1:
            LOGGER.warning(
                "WARNING: Failed to translate current chunk of phrases %s times. Final try.",
                num_of_tries - 1,
            )
        try:
            response = client.translate(
                target_language=target_language,
                source_language=api_source_language,
                model=translation_model,
                values=phrases_to_translate,
            )
            returned_translation = {}
            for translation in response:
                entry_fields = {
                    "standardized_phrase": translation["input"],
                    # The API returns HTML-escaped text; store the plain characters.
                    "translated_phrase": html.unescape(translation["translatedText"]),
                }
                # The detected language is only read when detection was requested.
                if api_source_language is None:
                    entry_fields["detected_language"] = translation["detectedSourceLanguage"]
                returned_translation[translation["input"]] = TranslationDictionary(
                    **entry_fields
                )
            LOGGER.debug(returned_translation)
            return returned_translation
        # TODO: check which exception returns the User Rate Limit error
        # to better handle the exceptions
        except Exception as excp:
            if "User Rate Limit Exceeded" in str(excp) and not last_attempt:
                LOGGER.warning(
                    "Google api_client API user rate limit exceeded, "
                    "waiting 10 seconds and retrying."
                )
                time.sleep(10)
            else:
                error_message = f"Could not translate current chunk of phrases. Error: {excp}"
                LOGGER.error(error_message)

    LOGGER.error("Ran out of number of tries. Skipping.")
    return None