Enrichment

Translation

The tamr-toolbox provides functions to translate standardized data and to store the translations in dictionaries, so that the same data is never translated twice. These translation capabilities can be applied to a Tamr dataset or to a pandas DataFrame.

Translate data within a pandas DataFrame and save dictionaries on disk

"""An example script to translate data from disk and save results on disk"""
from typing import List

import tamr_toolbox as tbox
import pandas as pd

import argparse


def main(
    json_credential_path: str,
    dictionary_folder: str,
    attributes_to_translate: List[str],
    path_to_csv_to_translate: str,
    path_to_translated_csv: str,
) -> None:
    """
    Translate data located on disk and save results to disk

    Reads a CSV file, sends the unique values of each requested attribute to the toolbox
    translation function (source language "fr", target language "en"), saves the updated
    translation dictionary in `dictionary_folder`, and writes the CSV back to disk with one
    extra "<attribute>_translated" column per requested attribute.

    Relies on the module-level `LOGGER` created in the `__main__` section of this script.

    Args:
        json_credential_path: path to the json file containing Google Translate API keys
        dictionary_folder: Path to the folder on disk where local versions of dictionary are saved
        attributes_to_translate: List of attributes from the local csv file to translate
        path_to_csv_to_translate: Path to the CSV file to translate
        path_to_translated_csv: path to the CSV file with translated data

    Returns:
        None
    """
    # language pair used by every dictionary and translation call below
    source_language = "fr"
    target_language = "en"

    # make Google api translation client
    google = tbox.enrichment.api_client.google.translation_client_from_json(json_credential_path)

    # read csv file from disk, keeping every column as dtype=object
    df = pd.read_csv(path_to_csv_to_translate, dtype=object)

    # load dictionary
    LOGGER.info("Starting translation from french to english")
    dictionary = tbox.enrichment.dictionary.load(
        dictionary_folder=dictionary_folder,
        target_language=target_language,
        source_language=source_language,
    )

    # translate attribute by attribute; the dictionary returned by one call is fed into the
    # next one so translations accumulate across attributes
    for attribute in attributes_to_translate:
        LOGGER.info(f"Translating attribute: {attribute}")
        dictionary = tbox.enrichment.translate.from_list(
            all_phrases=df[attribute].unique().tolist(),
            client=google,
            dictionary=dictionary,
            target_language=target_language,
            source_language=source_language,
            intermediate_save_every_n_chunks=100,
            intermediate_save_to_disk=True,
            intermediate_folder=dictionary_folder,
        )

    # save to disk new dictionary with added translation
    LOGGER.info("Finished translation from french to english")
    LOGGER.info("Saving updated dictionary to disk")
    tbox.enrichment.dictionary.save(
        translation_dictionary=dictionary,
        dictionary_folder=dictionary_folder,
        target_language=target_language,
        source_language=source_language,
    )

    # Translating dataframe insitu
    LOGGER.info("Translating dataframe from french to english")
    LOGGER.debug("Converting dictionary to mapping of original to translated phrases")
    # kept under its own name: the mapping is a different structure from the dictionary above
    phrase_mapping = tbox.enrichment.dictionary.convert_to_mappings(dictionary)

    for attribute in attributes_to_translate:
        LOGGER.info(f"Translating attribute {attribute} from french to english")
        df[attribute + "_translated"] = df[attribute].map(phrase_mapping)

    # Then save dataframe to disk
    df.to_csv(path_to_translated_csv, index=False)


if __name__ == "__main__":
    # Command line interface: a single optional path to a YAML configuration file
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    cli_args = cli_parser.parse_args()

    # Read the configuration from the given path, or from the default path when none is given
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=cli_args.config,
        default_path_to_file="/path/to/my/conf/dataset.config.yaml",
    )

    # Global logger; main() reads it as a module-level name
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the translation with the settings found in the configuration
    translation_config = CONFIG["translation"]
    main(
        json_credential_path=translation_config["json_credential_path"],
        dictionary_folder=CONFIG["path_to_dictionary_folder"],
        attributes_to_translate=translation_config["attributes"],
        path_to_csv_to_translate="path_to_my_data.csv",
        path_to_translated_csv="path_to_my_translated_data.csv",
    )

Translate data from Tamr and update dictionary saved as a source dataset on Tamr

"""An example script to translate data from Tamr and save results in Tamr"""
from typing import Dict, Any

import tamr_toolbox as tbox

import argparse


def main(
    *,
    instance_connection_info: Dict[str, Any],
    unified_dataset_id: str,
    dictionary_dataset_id: str,
) -> None:
    """
    Translate attributes of a Tamr unified dataset and update the dictionary dataset on Tamr

    The Google credentials path and the list of attributes to translate are read from the
    "translation" section of the module-level `CONFIG`, and progress is reported through the
    module-level `LOGGER`. Both are created in the `__main__` section of this script.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        unified_dataset_id: id of the Tamr unified dataset containing the data to translate
        dictionary_dataset_id: id of the Tamr toolbox translation dictionary dataset

    Returns:
        None
    """
    # Tamr client first, then the Google translation client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    credential_path = CONFIG["translation"]["json_credential_path"]
    translation_client = tbox.enrichment.api_client.google.translation_client_from_json(
        json_credential_path=credential_path
    )

    # attributes whose values must be translated
    attributes_to_translate = CONFIG["translation"]["attributes"]

    # Pull the data to translate into a dataframe. Ideally the dataset passed here is a delta
    # dataset holding only untranslated records: set up a SM project connected to the current
    # translated UD and filter to records with null values in the translated attributes.
    unified_dataset = tamr_client.datasets.by_resource_id(unified_dataset_id)
    untranslated_df = tbox.data_io.dataframe.from_dataset(
        unified_dataset, columns=attributes_to_translate, flatten_delimiter=" | "
    )

    # Fetch the existing dictionary from Tamr. It should follow the toolbox
    # TranslationDictionary class: "standardized_phrase" (str), "translated_phrase" (str),
    # "detected_language" (str), "original_phrases" (List[str])
    dictionary_dataset = tamr_client.datasets.by_resource_id(dictionary_dataset_id)
    dictionary = tbox.enrichment.dictionary.from_dataset(dictionary_dataset)

    # Translate column by column, carrying the growing dictionary from one call to the next
    for column_name in untranslated_df.columns:
        LOGGER.info(f"Translating attribute: {column_name}")
        unique_phrases = untranslated_df[column_name].unique().tolist()
        dictionary = tbox.enrichment.translate.from_list(
            all_phrases=unique_phrases,
            client=translation_client,
            dictionary=dictionary,
            source_language="fr",
            target_language="en",
        )

    # Push the updated dictionary back to its Tamr dataset
    updated_dataset_name = tbox.enrichment.dictionary.to_dataset(
        dictionary=dictionary,
        dataset=dictionary_dataset,
    )
    LOGGER.info(f"Tamr dataset {updated_dataset_name} updated with new translation data")
    LOGGER.info("Script complete.")


if __name__ == "__main__":
    # Command line interface: a single optional path to a YAML configuration file
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    cli_args = cli_parser.parse_args()

    # Read the configuration from the given path, or from the default path when none is given
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=cli_args.config,
        default_path_to_file="/path/to/my/conf/dataset.config.yaml",
    )

    # Global logger; main() reads both CONFIG and LOGGER as module-level names
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the translation against the instance and datasets named in the configuration
    connection_info = CONFIG["my_tamr_instance"]
    datasets_config = CONFIG["datasets"]
    main(
        instance_connection_info=connection_info,
        unified_dataset_id=datasets_config["my_mastering_project_dataset"]["id"],
        dictionary_dataset_id=datasets_config["my_dictionary"]["id"],
    )

Initialize a translation dictionary on Tamr

"""An example script to create an empty translation dictionary on Tamr"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse


def main(
    *,
    instance_connection_info: Dict[str, Any],
    dictionary_folder: str,
    source_language: str,
    target_language: str,
) -> None:
    """
    Create a toolbox translation dictionary dataset on Tamr

    The dictionary for the given language pair is loaded from `dictionary_folder` and pushed to
    Tamr with `create_dataset=True`. A warning is logged when the loaded dictionary is not
    empty; the dataset is still created in that case.

    Relies on the module-level `LOGGER` created in the `__main__` section of this script.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        dictionary_folder: Path to the folder on disk where local versions of dictionary are saved
        source_language: Source language of the dictionary
        target_language: Target language of the dictionary

    Returns:
        None
    """
    # Connect to tamr
    tamr_client = tbox.utils.client.create(**instance_connection_info)

    start_message = (
        f"Initiating empty translation dictionary from source language {source_language} "
        f"to target language {target_language}"
    )
    LOGGER.info(start_message)

    # Load the local dictionary for this language pair
    dictionary = tbox.enrichment.dictionary.load(
        dictionary_folder=dictionary_folder,
        target_language=target_language,
        source_language=source_language,
    )

    # Only warn when entries already exist: the upload below still runs
    dictionary_is_populated = len(dictionary) > 0
    if dictionary_is_populated:
        LOGGER.warning(
            f"Warning: dictionary from {source_language} to {target_language} in "
            f"{dictionary_folder} already exists and is not empty"
        )

    # Create the dictionary dataset on Tamr from the loaded dictionary
    created_dataset_name = tbox.enrichment.dictionary.to_dataset(
        dictionary=dictionary,
        datasets_collection=tamr_client.datasets,
        target_language=target_language,
        source_language=source_language,
        create_dataset=True,
    )
    LOGGER.info(f"{created_dataset_name} created as a source dataset on Tamr")


if __name__ == "__main__":
    # Command line interface: optional configuration path plus the mandatory language pair
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    cli_parser.add_argument("--source", help="source language to translate from", required=True)
    cli_parser.add_argument("--target", help="target language to translate to", required=True)
    cli_args = cli_parser.parse_args()

    # Read the configuration from the given path, or from the default path when none is given
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=cli_args.config,
        default_path_to_file="/path/to/my/conf/dataset.config.yaml",
    )

    # Global logger; main() reads it as a module-level name
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Create the dictionary dataset for the language pair given on the command line
    connection_info = CONFIG["my_tamr_instance"]
    local_dictionary_folder = CONFIG["translation"]["my_dictionary_folder"]
    main(
        instance_connection_info=connection_info,
        dictionary_folder=local_dictionary_folder,
        source_language=cli_args.source,
        target_language=cli_args.target,
    )