Projects

General

Add dataset to project and perform schema mapping

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""
Simple script to add a dataset and perform mappings via list of tuples or optionally bootstrap the
entire dataset.
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click


@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project to which to add the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to map")
@click.option(
    "--bootstrap", help="flag for whether or not to bootstrap the entire dataset", is_flag=True
)
@click.option(
    "--mappings",
    help="list of mappings to apply in format "
    "source_attr1,unified_attr1;source_attr2,unified_attr2",
    default="",
)
def main(
    config_file: str, project_name: str, source_dataset_name: str, bootstrap: bool, mappings: str
) -> None:
    """
    Add a Tamr dataset to a Tamr project and optionally bootstrap it or map it to the unified
    dataset following given attributes mapping

    Args:
        config_file: path to the config file containing server information
        project_name: name of the project to add a dataset to
        source_dataset_name: name of the dataset to add
        bootstrap: flag to bootstrap the entire dataset to the unified dataset of the project
        mappings: mappings to use to map the source dataset to the unified dataset, mappings
            should follow the format "source_attr1,unified_attr1;source_attr2,unified_attr2"

    Returns:

    """

    # setup logger
    logger = tbox.utils.logger.create("my_logger")

    # get config and setup client
    config = tbox.utils.config.from_yaml(config_file)
    client = tbox.utils.client.create(**config["my_tamr_instance"])

    # grab project and source dataset
    project = client.projects.by_name(project_name)
    source_dataset = client.datasets.by_name(source_dataset_name)

    if bootstrap:
        # bootstrap_dataset adds the dataset to the project itself when needed
        logger.info(f"bootstrapping dataset {source_dataset_name} in project {project_name}")
        tbox.project.mastering.schema.bootstrap_dataset(
            project, source_dataset=source_dataset, force_add_dataset_to_project=True
        )
        # if mappings is empty string we are done
        if mappings == "":
            logger.info("bootstrapped and mappings are empty so finishing")
            return None
    else:
        if mappings == "":
            logger.warning(
                "bootstrap not chosen but no mappings specified so exiting without doing anything"
            )
            return None
        # BUG FIX: add the dataset manually only when it was NOT bootstrapped above.
        # Previously this branch ran unconditionally after bootstrapping, logging a
        # misleading message and re-adding a dataset bootstrap_dataset had already added.
        logger.info(
            f"bootstrap not chosen so manually adding {source_dataset_name} "
            f"to project {project_name}"
        )
        project.add_input_dataset(source_dataset)

    # parse mapping tuples of the form "source_attr,unified_attr" separated by ";"
    try:
        mapping_tuples = []
        for pair in mappings.split(";"):
            parts = pair.split(",")
            mapping_tuples.append((parts[0], parts[1]))
    except Exception as e:
        error_message = (
            f"Provided mappings do not follow the format "
            f"'source_attr1,unified_attr1;source_attr2,unified_attr2', error: {e}"
        )
        logger.error(error_message)
        # chain the original exception so the underlying parse failure is preserved
        raise RuntimeError(error_message) from e

    for (source_attr, unified_attr) in mapping_tuples:
        logger.debug(f"applying the following mapping: {source_attr} --> {unified_attr}")
        tbox.project.mastering.schema.map_attribute(
            project,
            source_attribute_name=source_attr,
            source_dataset_name=source_dataset.name,
            unified_attribute_name=unified_attr,
        )


if __name__ == "__main__":
    # Click parses sys.argv and invokes the decorated main() command
    main()

Unmap datasets and remove from project

"""
Simple script to wholly unmap a dataset and remove it from a project
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click


@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project from which to remove the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to unmap/remove")
def main(config_file: str, project_name: str, source_dataset_name: str) -> None:
    """
    Unmap a dataset and remove it from a project

    Args:
        config_file: path to the config file containing server information
        project_name: name of the project to remove a dataset from
        source_dataset_name: name of the dataset to remove

    Returns:

    """
    # create a logger for this script
    logger = tbox.utils.logger.create("my_logger")

    # load the yaml config and build a Tamr client from it
    config = tbox.utils.config.from_yaml(config_file)
    client = tbox.utils.client.create(**config["my_tamr_instance"])

    # look up the target project and the dataset to remove
    project = client.projects.by_name(project_name)
    source_dataset = client.datasets.by_name(source_dataset_name)

    logger.info(
        f"unmapping and removing dataset {source_dataset_name} from project {project_name}"
    )
    # unmap every attribute of the dataset and drop it from the project's inputs
    tbox.project.mastering.schema.unmap_dataset(
        project, source_dataset=source_dataset, remove_dataset_from_project=True
    )


if __name__ == "__main__":
    # Click parses sys.argv and invokes the decorated main() command
    main()

Categorization

Run Categorization Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project without model training"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
    """Runs the continuous steps (no training) of a categorization project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the project by id and narrow it to a categorization project
    my_project = client.projects.by_resource_id(categorization_project_id).as_categorization()

    # Kick off the standard job sequence, skipping the model-training step
    LOGGER.info(f"About to run project: {my_project.name}")
    ops = tbox.project.categorization.jobs.run(my_project, run_apply_feedback=False)
    LOGGER.info(f"Tasks for {my_project.name} complete")

    return ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
    )

Run Categorization Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project step-by-step including model training
"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
    """Runs all steps of a categorization project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the project by id and narrow it to a categorization project
    my_project = client.projects.by_resource_id(categorization_project_id).as_categorization()

    LOGGER.info(f"About to run project with training: {my_project.name}")

    # Execute each stage in order, logging the operations each one produced
    all_ops: List[Operation] = []
    for step in (
        tbox.project.categorization.jobs.update_unified_dataset,
        tbox.project.categorization.jobs.apply_feedback_and_update_results,
    ):
        ops = step(my_project)
        LOGGER.info(f"Completed: {[op.description for op in ops]}")
        all_ops.extend(ops)

    LOGGER.info(f"All tasks for {my_project.name} complete")

    # Combined list of every operation run in the script, in execution order
    return all_ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
    )

Obtain Average Confidence for a Specific Tier

"""Snippet for retrieving confidence metrics from a Tamr Categorization project"""
import tamr_toolbox as tbox
from tamr_toolbox.project.categorization.metrics import get_tier_confidence

# Build a Tamr client (config-file loading and logger setup elided for brevity)
tamr_client = tbox.utils.client.create(username="user", password="pw", host="localhost")

# Look up the target categorization project by its resource id
my_project = tamr_client.projects.by_resource_id("my_project_id")

# Default behavior: average confidence at leaf nodes, without allowing a dataset refresh
default_confidence = get_tier_confidence(my_project)

# Pass allow_dataset_refresh=True to refresh the dataset first if it is not streamable
# NOTE THIS WILL KICK OFF A <MATERIALIZE VIEWS> JOB
refreshed_confidence = get_tier_confidence(my_project, allow_dataset_refresh=True)

# A specific tier can also be requested; tier numbering starts at 1
tier_one_confidence = get_tier_confidence(my_project, tier=1)

Mastering

Run Mastering Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Mastering project without model training"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
    """Runs the continuous steps (no training) of a mastering project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        mastering_project_id: The id of the target mastering project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the project by id and narrow it to a mastering project
    my_project = client.projects.by_resource_id(mastering_project_id).as_mastering()

    # Kick off the standard job sequence, skipping feedback and pair-count estimation
    LOGGER.info(f"About to run project: {my_project.name}")
    ops = tbox.project.mastering.jobs.run(
        my_project, run_apply_feedback=False, run_estimate_pair_counts=False
    )
    LOGGER.info(f"Tasks for {my_project.name} complete")

    return ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        mastering_project_id=CONFIG["projects"]["my_mastering_project"],
    )

Run Mastering Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Mastering project step-by-step including model training"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
    """Runs all steps of a mastering project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        mastering_project_id: The id of the target mastering project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the project by id and narrow it to a mastering project
    my_project = client.projects.by_resource_id(mastering_project_id).as_mastering()

    LOGGER.info(f"About to run project with training: {my_project.name}")

    # Execute each stage of the mastering pipeline in order,
    # logging the operations each one produced
    all_ops: List[Operation] = []
    for step in (
        tbox.project.mastering.jobs.update_unified_dataset,
        tbox.project.mastering.jobs.estimate_pair_counts,
        tbox.project.mastering.jobs.generate_pairs,
        tbox.project.mastering.jobs.apply_feedback_and_update_results,
        tbox.project.mastering.jobs.publish_clusters,
    ):
        ops = step(my_project)
        LOGGER.info(f"Completed: {[op.description for op in ops]}")
        all_ops.extend(ops)

    LOGGER.info(f"All tasks for {my_project.name} complete")

    # Combined list of every operation run in the script, in execution order
    return all_ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        mastering_project_id=CONFIG["projects"]["my_mastering_project"],
    )

Golden Records

Run Golden Records Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project"""
import argparse
from typing import List, Dict

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, str], golden_records_project_id: str
) -> List[Operation]:
    """Runs the continuous steps of a golden records project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        golden_records_project_id: The id of the target golden records project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the target golden records project by id
    my_project = client.projects.by_resource_id(golden_records_project_id)

    # Kick off the standard golden records job sequence
    LOGGER.info(f"About to run project: {my_project.name}")
    ops = tbox.project.golden_records.jobs.run(my_project)
    LOGGER.info(f"Tasks for {my_project.name} complete")

    return ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
    )

Run Golden Records Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project step-by-step"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], golden_records_project_id: str
) -> List[Operation]:
    """Runs all steps of a golden records project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        golden_records_project_id: The id of the target golden records project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the target golden records project by id
    my_project = client.projects.by_resource_id(golden_records_project_id)

    LOGGER.info(f"About to run project with training: {my_project.name}")

    # Execute each stage in order, logging the operations each one produced
    all_ops: List[Operation] = []
    for step in (
        tbox.project.golden_records.jobs.update_golden_records,
        tbox.project.golden_records.jobs.publish_golden_records,
    ):
        ops = step(my_project)
        LOGGER.info(f"Completed: {[op.description for op in ops]}")
        all_ops.extend(ops)

    LOGGER.info(f"All tasks for {my_project.name} complete")

    # Combined list of every operation run in the script, in execution order
    return all_ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
    )

Schema Mapping

Run Schema Mapping Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Schema Mapping project"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Runs the continuous steps of a schema mapping project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the target schema mapping project by id
    my_project = client.projects.by_resource_id(schema_mapping_project_id)

    # Kick off the standard schema mapping job sequence
    LOGGER.info(f"About to run project: {my_project.name}")
    ops = tbox.project.schema_mapping.jobs.run(my_project)
    LOGGER.info(f"Tasks for {my_project.name} complete")

    return ops


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Run Schema Mapping Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Schema Mapping project step-by-step"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Runs all steps of a schema mapping project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns: List of jobs run

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the target schema mapping project by id
    my_project = client.projects.by_resource_id(schema_mapping_project_id)

    LOGGER.info(f"About to run project: {my_project.name}")

    # Schema mapping has a single batch step: updating the unified dataset
    ops = tbox.project.schema_mapping.jobs.update_unified_dataset(my_project)
    LOGGER.info(f"Completed: {[op.description for op in ops]}")

    LOGGER.info(f"All tasks for {my_project.name} complete")

    # Return a copy of the operations run, in execution order
    return list(ops)


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, build the logger, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Transformations

Edit Unified Transformations

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations scoped to the unified dataset
in a Tamr project"""
from typing import Dict, Any

import tamr_toolbox as tbox
import argparse


def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edits unified transformations to a project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project

    """
    # Build a client for the configured Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Fetch the target project by id
    my_project = client.projects.by_resource_id(project_id)

    # Pull down the current unified-dataset transformations
    LOGGER.info(f"Updating transformations for {my_project.name}")
    unified_tx = tbox.project.schema_mapping.transformations.get_all_unified(my_project)
    # the mastering and categorization modules expose the same get_all_unified helper

    # Inspect what came back — a list of transformation strings, e.g.:
    #   ['SELECT *, concat(to_string(first_name), ...) as "full_name";',
    #    '//Just a comment',
    #    '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;']
    LOGGER.debug(unified_tx)

    # Edit the list locally; nothing is sent to Tamr until set_all_unified below.

    # Drop the second-to-last transformation on the unified dataset ...
    unified_tx.pop(-2)

    # ... append one at the end ...
    unified_tx.append("SELECT *, lower(to_string(last_name)) as last_name;")

    # ... and insert one as the second transformation.
    unified_tx.insert(1, "SELECT *, to_int(ssn) as ssn;")

    # Inspect the edited list before pushing it
    LOGGER.debug(unified_tx)

    # Replace the project's unified transformations with the updated list
    tbox.project.schema_mapping.transformations.set_all_unified(my_project, unified_tx)
    # the mastering and categorization modules expose the same set_all_unified helper

    LOGGER.info(f"Completed updating unified transformations for {my_project.name}")


if __name__ == "__main__":
    # Script entry point: parse CLI args, load config, set up logging, then run main()
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger (main() reads this module-level LOGGER)
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Direct the loggers for tamr-toolbox and tamr-unify-client to the same file
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Edit Unified and Input Transformations

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations of a Tamr project"""
from typing import Dict, Any

import tamr_toolbox as tbox
import argparse


def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Retrieve, edit locally, and re-apply the transformations of a Tamr project.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project

    """

    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target project by its resource id
    project = client.projects.by_resource_id(project_id)

    # Fetch the project's current transformations (both input and unified scope)
    LOGGER.info(f"Updating transformations for {project.name}")
    project_tx = tbox.project.schema_mapping.transformations.get_all(project)
    # this can also be called as tbox.project.mastering.transformations.get_all
    # this can also be called as tbox.project.categorization.transformations.get_all

    # Inspect what came back
    LOGGER.debug(project_tx)
    # Example print output
    """
    Transformations(
      input_scope=[
        InputTransformation(
          transformation='select *, lower(to_string(first_name)) as "first_name"',
          datasets=[
            tamr_unify_client.dataset.resource.Dataset(
              relative_id='datasets/3', name='people_tiny.csv', version='2')
              ]
          )
        ],
      unified_scope=[
        'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
        '//Just a comment',
        '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
        ]
      )
    """

    # All edits below happen on the local project_tx object; nothing reaches Tamr
    # until set_all is called at the end.

    # Remove the second-to-last unified transformation
    del project_tx.unified_scope[-2]

    # Add a new unified transformation at the end
    project_tx.unified_scope.append("SELECT *, lower(to_string(last_name)) as last_name;")

    # Slot a unified transformation into the second position
    project_tx.unified_scope.insert(1, "SELECT *, to_int(ssn) as ssn;")

    # Build an input transformation scoped to one input dataset and make it run first
    employees = project.input_datasets().by_name("employees_tiny.csv")
    ssn_tx = tbox.project.schema_mapping.transformations.InputTransformation(
        "SELECT *, to_int(ssn) as ssn;", [employees]
    )
    # this can also be called as tbox.project.mastering.transformations.InputTransformation
    # this can also be called as tbox.project.categorization.transformations.InputTransformation
    project_tx.input_scope.insert(0, ssn_tx)

    # Inspect the edited state before posting
    LOGGER.debug(project_tx)
    # Example print output
    """
    Transformations(
      input_scope=[
        InputTransformation(
          transformation='SELECT *, to_int(ssn) as ssn;',
          datasets=[
            tamr_unify_client.dataset.resource.Dataset(
              relative_id='datasets/64', name='employees_tiny.csv', version='162')
              ]
          ),
        InputTransformation(
          transformation='select *, lower(to_string(first_name)) as "first_name"',
          datasets=[
            tamr_unify_client.dataset.resource.Dataset(
              relative_id='datasets/3', name='people_tiny.csv', version='2')
              ]
          )
        ],
      unified_scope=[
        'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
        'SELECT *, to_int(ssn) as ssn;',
        '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
        'SELECT *, lower(to_string(last_name)) as last_name;'
        ]
      )
    """

    # Post the edited transformations back to the Tamr project
    tbox.project.schema_mapping.transformations.set_all(project, project_tx)
    # this can also be called as tbox.project.mastering.transformations.set_all
    # this can also be called as tbox.project.categorization.transformations.set_all

    LOGGER.info(f"Completed updating transformations for {project.name}")


if __name__ == "__main__":
    # Command line interface: one optional --config flag
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    cli_args = cli_parser.parse_args()

    # Read configuration from the given path, falling back to the default location
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=cli_args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Create the global logger that main() reads
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Send tamr-toolbox and tamr-unify-client log output to the same directory
    for package_name in ("tamr_toolbox", "tamr_unify_client"):
        tbox.utils.logger.enable_package_logging(
            package_name, log_directory=CONFIG["logging_dir"]
        )

    # Kick off the example against the configured schema mapping project
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )