"""Tasks related to creating, updating, saving and moving translation dictionaries
in and out of Tamr"""
import json
import logging
import os
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Set, Union
from requests.exceptions import HTTPError
from tamr_unify_client.dataset.collection import DatasetCollection
from tamr_unify_client.dataset.resource import Dataset
from tamr_toolbox.enrichment.enrichment_utils import SetEncoder, create_empty_mapping
LOGGER = logging.getLogger(__name__)
@dataclass
class TranslationDictionary:
    """
    A DataClass holding one entry of a toolbox translation dictionary

    Every field has a default, so an entry can be built from a partial or empty record.

    Args:
        standardized_phrase: The unique common standardized version of all original_phrases
        translated_phrase: The standardized phrase translated into the target language of the
            dictionary
        detected_language: The language detected for the standardized phrase if source language
            is set to auto
        original_phrases: A set of original phrases which all convert to the standardized phrase
            when applying standardization
    """

    # The three text fields default to None, so they are annotated as Optional
    standardized_phrase: Optional[str] = None
    translated_phrase: Optional[str] = None
    detected_language: Optional[str] = None
    # default_factory gives every instance its own set rather than one shared mutable default
    original_phrases: Set[str] = field(default_factory=set)
def filename(
    dictionary_folder: Union[str, os.PathLike],
    *,
    target_language: str = "en",
    source_language: Optional[str] = "auto",
) -> str:
    """
    Generate a toolbox translation dictionary file path

    The file name has the form ``dictionary_<source>_to_<target>.json`` with both language
    codes lower-cased. Nothing is created on disk.

    Args:
        dictionary_folder: base directory where dictionaries are saved, given as a string,
            a `Path` or any other path-like object
        target_language: the language to translate into, for a list of allowed inputs:
            https://cloud.google.com/translate/docs/basic/discovering-supported-languages
        source_language: the language the text to translate is in, if None, assumes it is "auto"

    Returns:
        A toolbox translation dictionary file path
    """
    if source_language is None:
        source_language = "auto"

    dictionary_name = f"dictionary_{source_language.lower()}_to_{target_language.lower()}.json"

    # Path() accepts str, Path and any other os.PathLike, so no type dispatch is needed
    return str(Path(dictionary_folder) / dictionary_name)
def create(
    dictionary_folder: str, *, target_language: str = "en", source_language: str = "auto"
) -> str:
    """
    Create an empty dictionary on disk

    The location is derived with `filename` and the file itself is initialised by
    `create_empty_mapping`.

    Args:
        dictionary_folder: base directory where dictionary is saved
        target_language: the language to translate into, for a list of allowed inputs:
            https://cloud.google.com/translate/docs/basic/discovering-supported-languages
        source_language: the language the text to translate is in, if None, assumes it is "auto"

    Returns:
        A path to a dictionary
    """
    language_options = {"target_language": target_language, "source_language": source_language}
    path_on_disk = filename(dictionary_folder, **language_options)
    create_empty_mapping(path_on_disk)
    return path_on_disk
def to_json(dictionary: Dict[str, TranslationDictionary]) -> List[str]:
    """
    Serialize every entry of a toolbox translation dictionary to its own JSON string

    The keys of `dictionary` are not used; only the entries (values) are serialized, in
    iteration order.

    Args:
        dictionary: a toolbox translation dictionary

    Returns:
        A list of toolbox translation dictionary entries in json format
    """
    serialized_entries = []
    for entry in dictionary.values():
        # SetEncoder is passed as the JSON encoder class so that the `original_phrases`
        # set can be written out (plain json cannot serialize sets)
        serialized_entries.append(json.dumps(asdict(entry), cls=SetEncoder))
    return serialized_entries
def to_dict(dictionary: Dict[str, TranslationDictionary]) -> List[Dict[str, Union[str, List]]]:
    """
    Convert every entry of a toolbox translation dictionary to a plain python dictionary

    The keys of `dictionary` are not used; only the entries (values) are converted, in
    iteration order.

    Args:
        dictionary: a toolbox translation dictionary

    Returns:
        A list of toolbox translation dictionary entries in dictionary format
    """
    plain_entries = []
    for entry in dictionary.values():
        # Round-trip through JSON so the result only holds JSON-native types
        # (the `original_phrases` set comes back as a list)
        as_json = json.dumps(asdict(entry), cls=SetEncoder)
        plain_entries.append(json.loads(as_json))
    return plain_entries
def save(
    translation_dictionary: Dict[str, TranslationDictionary],
    dictionary_folder: str,
    *,
    target_language: str = "en",
    source_language: str = "auto",
) -> None:
    """
    Save a toolbox translation dictionary to disk

    Entries are written one JSON document per line. An empty dictionary is not written at
    all, so an existing file is left untouched in that case.

    Args:
        translation_dictionary: dictionary object to be saved to disk
        dictionary_folder: base directory where dictionary is saved
        target_language: the language to translate into, for a list of allowed inputs:
            https://cloud.google.com/translate/docs/basic/discovering-supported-languages
        source_language: the language the text to translate is in, if None, assumes it is "auto"

    Returns:
        None
    """
    dictionary_filepath = filename(
        dictionary_folder, target_language=target_language, source_language=source_language
    )

    if len(translation_dictionary) > 0:
        LOGGER.debug("Writing Dictionary to file")
        # Serialize BEFORE opening the file: opening in "w" mode truncates it, so a
        # serialization error must not be able to wipe the dictionary already on disk
        serialized_dictionary = "\n".join(to_json(translation_dictionary))
        with open(dictionary_filepath, "w") as f:
            f.write(serialized_dictionary)
def load(
    dictionary_folder: str, *, target_language: str = "en", source_language: str = "auto"
) -> Dict[str, TranslationDictionary]:
    """
    Load a toolbox translation dictionary from disk to memory

    If no dictionary file exists yet for the language pair, an empty one is created first.
    The file is read as one JSON document per line; blank lines are ignored. Entries without
    a standardized phrase are dropped.

    Args:
        dictionary_folder: base directory where dictionary is saved
        target_language: the language to translate into, for a list of allowed inputs:
            https://cloud.google.com/translate/docs/basic/discovering-supported-languages
        source_language: the language the text to translate is in, if None, assumes it is "auto"

    Returns:
        A toolbox translation dictionary

    Raises:
        RuntimeError: if the dictionary was found on disk but is not of a valid
            toolbox translation dictionary type
        json.JSONDecodeError: if a non-blank line of the file is not valid JSON
    """
    dictionary_filepath = filename(
        dictionary_folder, target_language=target_language, source_language=source_language
    )

    if not os.path.exists(dictionary_filepath):
        LOGGER.info(f"Dictionary {dictionary_filepath} does not exists, creating an empty one.")
        dictionary_filepath = create(
            dictionary_folder, target_language=target_language, source_language=source_language
        )

    # Blank lines (for instance a trailing newline added by an editor) carry no entry and
    # would make json.loads fail, so they are skipped
    with open(dictionary_filepath, "r") as f:
        raw_entries = [json.loads(line) for line in f if line.strip()]

    try:
        # Transform the loaded records into TranslationDictionary entries
        entries = [TranslationDictionary(**raw_entry) for raw_entry in raw_entries]

        # Original phrases are stored as a list in the file, they are kept as a set in memory
        for entry in entries:
            entry.original_phrases = set(entry.original_phrases)

        # Make the standardized phrase the main key of the translation dictionary to be able
        # to access each translation easily
        translation_dictionary = {
            entry.standardized_phrase: entry
            for entry in entries
            if entry.standardized_phrase is not None
        }
    except Exception as e:
        error_message = (
            f"Could not read translation dictionary at {dictionary_filepath}. "
            f"Check that the dictionary is of the correct type. Error: {e}"
        )
        LOGGER.error(error_message)
        # Chain the original exception so its traceback is not lost
        raise RuntimeError(error_message) from e

    return translation_dictionary
def update(
    main_dictionary: Dict[str, TranslationDictionary],
    tmp_dictionary: Dict[str, TranslationDictionary],
) -> None:
    """
    Merge the translations of a temporary translation dictionary into the main one, in place

    For a standardized phrase already present in `main_dictionary`, only the translated
    phrase and detected language are overwritten; its original phrases are left as they are.
    A phrase that is not present yet is added as a new entry.

    Args:
        main_dictionary: the main toolbox translation dictionary containing past
            translation results
        tmp_dictionary: a temporary toolbox translation dictionary containing new translation

    Returns:
        None, `main_dictionary` is modified in place
    """
    for phrase_key, new_translation in tmp_dictionary.items():
        try:
            existing_entry = main_dictionary[phrase_key]
        except KeyError:
            # Unknown phrase: add a fresh entry built from the temporary one
            # NOTE(review): the new entry reuses the same `original_phrases` set object as
            # the temporary entry
            main_dictionary[phrase_key] = TranslationDictionary(
                standardized_phrase=phrase_key,
                translated_phrase=new_translation.translated_phrase,
                detected_language=new_translation.detected_language,
                original_phrases=new_translation.original_phrases,
            )
            continue

        # Known phrase: refresh the translation results only
        existing_entry.translated_phrase = new_translation.translated_phrase
        existing_entry.detected_language = new_translation.detected_language
def convert_to_mappings(dictionary: Dict[str, TranslationDictionary]) -> Dict[str, str]:
    """
    Flatten a translation dictionary into a lookup from original phrase to translated phrase

    Each original phrase of each entry is mapped to that entry's translated phrase. If the
    same original phrase appears in several entries, the entry iterated last wins.

    Args:
        dictionary: a toolbox translation dictionary

    Returns:
        a dictionary with original phrase as key and translate phrase as value
    """
    phrase_to_translation = {}
    for entry in dictionary.values():
        for source_phrase in entry.original_phrases:
            phrase_to_translation[source_phrase] = entry.translated_phrase
    return phrase_to_translation
def _first_or_none(values: Optional[List[str]]) -> Optional[str]:
    """Return the first element of `values`, or None when `values` is None or empty"""
    return values[0] if values else None


def from_dataset(dataset: Dataset) -> Dict[str, TranslationDictionary]:
    """
    Stream a dictionary from Tamr

    Every record of the dataset becomes one entry, keyed by its standardized phrase. The
    translated phrase and detected language are unwrapped from their single-element lists.
    A missing (None or empty) value is kept as None instead of failing the whole read.

    Args:
        dataset: Tamr Dataset object

    Returns:
        A toolbox translation dictionary

    Raises:
        ValueError: if the provided `dataset` is not a toolbox translation dictionary dataset
        NameError: if the provided `dataset` does not contain all the attributes of a
            toolbox translation dictionary
        RuntimeError: if there is any other problem while reading the `dataset` as a
            toolbox translation dictionary
    """
    if dataset.key_attribute_names[0] != "standardized_phrase":
        error_message = "Provided Tamr Dataset is not a toolbox translation dictionary"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    dictionary = {}
    for record in dataset.records():
        try:
            entry = TranslationDictionary(**record)

            # values are returned as a list of a single string, we change this to string;
            # presumably an entry that was never translated/detected comes back as null,
            # so None and empty values are tolerated rather than indexed blindly
            entry.translated_phrase = _first_or_none(entry.translated_phrase)
            entry.detected_language = _first_or_none(entry.detected_language)

            # original phrases are stored on Tamr as lists, we save it as a set
            entry.original_phrases = set(entry.original_phrases or ())

        except NameError as e:
            error_message = (
                f"Supplied Tamr dataset is not in a toolbox translation dictionary format: {e}"
            )
            LOGGER.error(error_message)
            raise NameError(error_message) from e
        except Exception as e:
            error_message = f"Error while reading the Tamr dataset translation dictionary: {e}"
            LOGGER.error(error_message)
            # Chain the original exception so its traceback is not lost
            raise RuntimeError(error_message) from e

        dictionary[entry.standardized_phrase] = entry

    return dictionary
def _create_dictionary_dataset(
    datasets_collection: DatasetCollection, dataset_name: str
) -> Dataset:
    """Create the dataset `dataset_name` keyed on standardized_phrase, with its attributes"""
    creation_spec = {"name": dataset_name, "keyAttributeNames": ["standardized_phrase"]}
    dataset = datasets_collection.create(creation_spec)

    attributes = dataset.attributes
    for attribute in ["translated_phrase", "detected_language", "original_phrases"]:
        attr_spec = {
            "name": attribute,
            "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}},
        }
        try:
            attributes.create(attr_spec)
        except HTTPError as e:
            error_message = (
                f"Error while creating attribute {attribute} for dataset {dataset_name}: {e}"
            )
            LOGGER.error(error_message)
            # Chain the HTTP error so its traceback is not lost
            raise RuntimeError(error_message) from e

    return dataset


def to_dataset(
    dictionary: Dict[str, TranslationDictionary],
    *,
    dataset: Optional[Dataset] = None,
    datasets_collection: Optional[DatasetCollection] = None,
    target_language: Optional[str] = None,
    source_language: Optional[str] = None,
    create_dataset: bool = False,
) -> str:
    """
    Ingest a toolbox dictionary in Tamr, creates the source dataset if it doesn't exists

    With `create_dataset` False the entries are upserted into the given `dataset`. Otherwise
    a new dataset, named after the dictionary file name for the language pair, is created in
    `datasets_collection` first and then filled.

    Args:
        dictionary: a toolbox translation dictionary
        dataset: a Tamr client dataset
        datasets_collection: a Tamr client datasets collection
        target_language: the target language of the given dictionary
        source_language: the source language of the given dictionary
        create_dataset: flag to create or upsert to an existing translation dictionary
            source dataset

    Returns:
        The name of the created or updated Tamr Dataset

    Raises:
        ValueError: if `create_dataset` is False and `dataset` is not provided or is not a
            toolbox translation dictionary dataset.
            If `create_dataset` is True but `datasets_collection` or `target_language` or
            `source_language` is missing or the Tamr dataset already exists
        RuntimeError: if there is an error during the creation of the Tamr dataset attributes
    """
    if create_dataset is False:
        if dataset is None:
            error_message = (
                "Tamr Client Dataset missing from inputs, please provide a Tamr "
                "Client Dataset if updating an existing translation dictionary dataset"
            )
            LOGGER.error(error_message)
            raise ValueError(error_message)

        if dataset.key_attribute_names[0] != "standardized_phrase":
            error_message = "Provided Tamr Dataset is not a toolbox translation dictionary"
            LOGGER.error(error_message)
            raise ValueError(error_message)

    else:
        if not (datasets_collection and target_language and source_language):
            error_message = (
                "A Tamr Datasets Collection, target_language and source_language "
                "must all be inputs if creating the toolbox translation "
                "dictionary dataset"
            )
            LOGGER.error(error_message)
            raise ValueError(error_message)

        # Get dataset name using filename function
        # The value of dictionary folder here is unimportant
        dataset_name = os.path.basename(
            filename(
                dictionary_folder="not/a/real/path",  # will be dropped immediately
                target_language=target_language,
                source_language=source_language,
            )
        )

        # any() stops at the first match instead of collecting every dataset name first
        if any(existing.name == dataset_name for existing in datasets_collection):
            error_message = (
                f"Tamr Dataset {dataset_name} already exists on Tamr, you cannot "
                f"create a dataset with the same name as another one"
            )
            LOGGER.error(error_message)
            raise ValueError(error_message)

        LOGGER.info(f"Creating toolbox translation dictionary dataset {dataset_name} on Tamr")
        dataset = _create_dictionary_dataset(datasets_collection, dataset_name)

    LOGGER.info("Ingesting toolbox translation dictionary to Tamr")
    dataset.upsert_records(records=to_dict(dictionary), primary_key_name="standardized_phrase")
    return dataset.name