Enrichment¶
Translation¶
The tamr-toolbox provides functions to translate standardized data and store the results in dictionaries, ensuring that the same data is not translated twice. These translation capabilities can be applied to a Tamr dataset or a Pandas DataFrame.
Translate data within a pandas DataFrame and save dictionaries on disk¶
"""An example script to translate data from disk and save results on disk"""
from typing import List
import tamr_toolbox as tbox
import pandas as pd
import argparse
def main(
    json_credential_path: str,
    dictionary_folder: str,
    attributes_to_translate: List[str],
    path_to_csv_to_translate: str,
    path_to_translated_csv: str,
    *,
    source_language: str = "fr",
    target_language: str = "en",
) -> None:
    """
    Translate data located on disk and save results to disk

    Args:
        json_credential_path: path to the json file containing Google Translate API keys
        dictionary_folder: Path to the folder on disk where local versions of dictionary are saved
        attributes_to_translate: List of attributes from the local csv file to translate
        path_to_csv_to_translate: Path to the CSV file to translate
        path_to_translated_csv: path to the CSV file with translated data
        source_language: language code of the data to translate (defaults to "fr")
        target_language: language code to translate into (defaults to "en")

    Returns:
        None
    """
    # make Google api translation client
    google = tbox.enrichment.api_client.google.translation_client_from_json(json_credential_path)

    # read csv file from disk; dtype=object keeps all values as strings
    df = pd.read_csv(path_to_csv_to_translate, dtype=object)

    # load the local dictionary so phrases translated in earlier runs are not re-translated
    LOGGER.info("Starting translation from %s to %s", source_language, target_language)
    dictionary = tbox.enrichment.dictionary.load(
        dictionary_folder=dictionary_folder,
        target_language=target_language,
        source_language=source_language,
    )

    # translate attribute by attribute, accumulating new phrases into the dictionary
    for attribute in attributes_to_translate:
        LOGGER.info("Translating attribute: %s", attribute)
        dictionary = tbox.enrichment.translate.from_list(
            all_phrases=df[attribute].unique().tolist(),
            client=google,
            dictionary=dictionary,
            target_language=target_language,
            source_language=source_language,
            # checkpoint the dictionary to disk periodically so an interruption
            # does not lose already-completed (billable) API translations
            intermediate_save_every_n_chunks=100,
            intermediate_save_to_disk=True,
            intermediate_folder=dictionary_folder,
        )

    # save to disk the dictionary with the newly added translations
    LOGGER.info("Finished translation from %s to %s", source_language, target_language)
    LOGGER.info("Saving updated dictionary to disk")
    tbox.enrichment.dictionary.save(
        translation_dictionary=dictionary,
        dictionary_folder=dictionary_folder,
        target_language=target_language,
        source_language=source_language,
    )

    # Translate the dataframe in situ: reduce the dictionary to a plain
    # original-phrase -> translated-phrase mapping and apply it per attribute
    LOGGER.info("Translating dataframe from %s to %s", source_language, target_language)
    LOGGER.debug("Converting dictionary to mapping of original to translated phrases")
    dictionary = tbox.enrichment.dictionary.convert_to_mappings(dictionary)
    for attribute in attributes_to_translate:
        LOGGER.info("Translating attribute %s", attribute)
        df[attribute + "_translated"] = df[attribute].map(dictionary)

    # Then save the translated dataframe to disk
    df.to_csv(path_to_translated_csv, index=False)
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/dataset.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: CONFIG and LOGGER are module-level globals that main() reads directly
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function
    # NOTE(review): the two CSV paths below are hard-coded example placeholders —
    # replace them (or move them into the YAML config) for real use
    main(
        json_credential_path=CONFIG["translation"]["json_credential_path"],
        dictionary_folder=CONFIG["path_to_dictionary_folder"],
        attributes_to_translate=CONFIG["translation"]["attributes"],
        path_to_csv_to_translate="path_to_my_data.csv",
        path_to_translated_csv="path_to_my_translated_data.csv",
    )
Translate data from Tamr and update dictionary saved as a source dataset on Tamr¶
"""An example script to translate data from Tamr and save results in Tamr"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse
def main(
    *,
    instance_connection_info: Dict[str, Any],
    unified_dataset_id: str,
    dictionary_dataset_id: str,
) -> None:
    """
    Translate data streamed from Tamr and save results on Tamr

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        unified_dataset_id: id of the Tamr unified dataset containing the data to translate
        dictionary_dataset_id: id of the Tamr toolbox translation dictionary dataset

    Returns:
    """
    # Build the Tamr client and the Google Translate client.
    # NOTE: the Google credentials and the attribute list are read from the
    # module-level CONFIG global set up in the __main__ block.
    tamr = tbox.utils.client.create(**instance_connection_info)
    credentials_path = CONFIG["translation"]["json_credential_path"]
    google = tbox.enrichment.api_client.google.translation_client_from_json(
        json_credential_path=credentials_path
    )

    attributes_to_translate = CONFIG["translation"]["attributes"]

    # Stream the unified dataset into a dataframe. Best practice is to pass a delta
    # dataset where only untranslated data is kept: set up an SM project connected
    # to your current translated UD and filter to records with null values in the
    # translated attributes.
    dataset = tamr.datasets.by_resource_id(unified_dataset_id)
    df = tbox.data_io.dataframe.from_dataset(
        dataset, columns=attributes_to_translate, flatten_delimiter=" | "
    )

    # Stream the dictionary from Tamr. Dictionaries should follow the toolbox
    # TranslationDictionary schema: "standardized_phrase" (str),
    # "translated_phrase" (str), "detected_language" (str), "original_phrases" (List[str])
    dictionary_dataset = tamr.datasets.by_resource_id(dictionary_dataset_id)
    dictionary = tbox.enrichment.dictionary.from_dataset(dictionary_dataset)

    # Translate one attribute at a time, growing the dictionary as we go
    for column in df.columns:
        LOGGER.info(f"Translating attribute: {column}")
        unique_phrases = df[column].unique().tolist()
        dictionary = tbox.enrichment.translate.from_list(
            all_phrases=unique_phrases,
            client=google,
            dictionary=dictionary,
            source_language="fr",
            target_language="en",
        )

    # Push the updated dictionary back to its Tamr dataset
    dataset_name = tbox.enrichment.dictionary.to_dataset(
        dictionary=dictionary, dataset=dictionary_dataset,
    )
    LOGGER.info(f"Tamr dataset {dataset_name} updated with new translation data")
    LOGGER.info("Script complete.")
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/dataset.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: CONFIG and LOGGER are module-level globals that main() reads directly
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function with dataset ids resolved from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        unified_dataset_id=CONFIG["datasets"]["my_mastering_project_dataset"]["id"],
        dictionary_dataset_id=CONFIG["datasets"]["my_dictionary"]["id"],
    )
Initiate a translation dictionary on Tamr¶
"""An example script to create an empty translation dictionary on Tamr"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse
def main(
    *,
    instance_connection_info: Dict[str, Any],
    dictionary_folder: str,
    source_language: str,
    target_language: str,
) -> None:
    """
    Create an empty toolbox translation dictionary dataset on Tamr

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        dictionary_folder: Path to the folder on disk where local versions of dictionary are saved
        source_language: Source language of the dictionary
        target_language: Target language of the dictionary

    Returns:
    """
    # Connect to the Tamr instance
    tamr = tbox.utils.client.create(**instance_connection_info)

    # Load the local dictionary for this language pair (an empty one is used
    # when nothing exists on disk yet)
    LOGGER.info(
        f"Initiating empty translation dictionary from source language {source_language} "
        f"to target language {target_language}"
    )
    dictionary = tbox.enrichment.dictionary.load(
        dictionary_folder=dictionary_folder,
        target_language=target_language,
        source_language=source_language,
    )

    # A non-empty dictionary means one already existed locally: warn but proceed
    if len(dictionary) > 0:
        LOGGER.warning(
            f"Warning: dictionary from {source_language} to {target_language} in "
            f"{dictionary_folder} already exists and is not empty"
        )

    # Materialize the dictionary as a new source dataset on Tamr
    dataset_name = tbox.enrichment.dictionary.to_dataset(
        dictionary=dictionary,
        datasets_collection=tamr.datasets,
        target_language=target_language,
        source_language=source_language,
        create_dataset=True,
    )
    LOGGER.info(f"{dataset_name} created as a source dataset on Tamr")
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parser.add_argument("--source", help="source language to translate from", required=True)
    parser.add_argument("--target", help="target language to translate to", required=True)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/dataset.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: CONFIG and LOGGER are module-level globals that main() reads directly
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function; language pair comes from the command line, the
    # dictionary folder from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        dictionary_folder=CONFIG["translation"]["my_dictionary_folder"],
        source_language=args.source,
        target_language=args.target,
    )