Projects

General

Add dataset to project and perform schema mapping

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""
Simple script to add a dataset and perform mappings via list of tuples or optionally bootstrap the
entire dataset.
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click


@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project to which to add the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to map")
@click.option(
    "--bootstrap", help="flag for whether or not to bootstrap the entire dataset", is_flag=True
)
@click.option(
    "--mappings",
    help="list of mappings to apply in format "
    "source_attr1,unified_attr1;source_attr2,unified_attr2",
    default="",
)
def main(
    config_file: str, project_name: str, source_dataset_name: str, bootstrap: bool, mappings: str
) -> None:
    """
    Add a Tamr dataset to a Tamr project and optionally bootstrap it or map it to the unified
    dataset following given attributes mapping

    Args:
        config_file: path to the config file containing server information
        project_name: name of the project to add a dataset to
        source_dataset_name: name of the dataset to add
        bootstrap: flag to bootstrap the entire dataset to the unified dataset of the project
        mappings: mappings to use to map the source dataset to the unified dataset, mappings
            should follow the format "source_attr1,unified_attr1;source_attr2,unified_attr2"

    Returns:
        None

    Raises:
        RuntimeError: if `mappings` does not follow the expected
            "source_attr1,unified_attr1;source_attr2,unified_attr2" format
    """

    # setup logger
    logger = tbox.utils.logger.create("my_logger")

    # get config and setup client
    config = tbox.utils.config.from_yaml(config_file)
    client = tbox.utils.client.create(**config["my_tamr_instance"])

    # grab project and source dataset
    project = client.projects.by_name(project_name)
    source_dataset = client.datasets.by_name(source_dataset_name)

    if bootstrap:
        # bootstrap_dataset is called with force_add_dataset_to_project=True, so the dataset
        # is added to the project here and no manual add is needed afterwards
        logger.info(f"bootstrapping dataset {source_dataset_name} in project {project_name}")
        tbox.project.mastering.schema.bootstrap_dataset(
            project, source_dataset=source_dataset, force_add_dataset_to_project=True
        )
        # if mappings is empty string we are done
        if mappings == "":
            logger.info("bootstrapped and mappings are empty so finishing")
            return None
    else:
        if mappings == "":
            logger.warning(
                "bootstrap not chosen but no mappings specified so exiting without doing anything"
            )
            return None
        # BUGFIX: the manual add (and its log message) now runs only when bootstrap was NOT
        # chosen; previously this code also ran after a successful bootstrap, logging the
        # misleading "bootstrap not chosen" message and redundantly re-adding the dataset
        logger.info(
            f"bootstrap not chosen so manually adding {source_dataset_name} "
            f"to project {project_name}"
        )
        project.add_input_dataset(source_dataset)

    # parse mapping tuples of the form (source_attribute, unified_attribute)
    try:
        mapping_tuples = [(x.split(",")[0], x.split(",")[1]) for x in mappings.split(";")]
    except Exception as e:
        error_message = (
            f"Provided mappings do not follow the format "
            f"'source_attr1,unified_attr1;source_attr2,unified_attr2', error: {e}"
        )
        logger.error(error_message)
        # chain the original exception so the root cause stays visible in the traceback
        raise RuntimeError(error_message) from e

    for (source_attr, unified_attr) in mapping_tuples:
        logger.debug(f"applying the following mapping: {source_attr} --> {unified_attr}")
        tbox.project.mastering.schema.map_attribute(
            project,
            source_attribute_name=source_attr,
            source_dataset_name=source_dataset.name,
            unified_attribute_name=unified_attr,
        )


# Script entry point: click parses the command-line options and invokes main
if __name__ == "__main__":
    main()

Unmap datasets and remove from project

"""
Simple script to wholly unmap a dataset and remove it from a project
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click


@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project from which to remove the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to unmap/remove")
def main(config_file: str, project_name: str, source_dataset_name: str) -> None:
    """
    Unmap a dataset and remove it from a project

    Args:
        config_file: path to the config file containing server information
        project_name: name of the project to remove a dataset from
        source_dataset_name: name of the dataset to remove

    Returns:
        None
    """
    # set up logging
    logger = tbox.utils.logger.create("my_logger")

    # build a Tamr client from the yaml config
    conf = tbox.utils.config.from_yaml(config_file)
    tamr = tbox.utils.client.create(**conf["my_tamr_instance"])

    # look up the target project and the dataset to remove
    dataset = tamr.datasets.by_name(source_dataset_name)
    target_project = tamr.projects.by_name(project_name)

    logger.info(
        f"unmapping and removing dataset {source_dataset_name} from project {project_name}"
    )
    # unmap every attribute of the dataset and drop it from the project in one call
    tbox.project.mastering.schema.unmap_dataset(
        target_project, source_dataset=dataset, remove_dataset_from_project=True
    )


# Script entry point: click parses the command-line options and invokes main
if __name__ == "__main__":
    main()

Categorization

Run Categorization Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project without model training"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
    """Runs the continuous steps (no training) of a categorization project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target project and view it as a categorization project
    project = client.projects.by_resource_id(categorization_project_id).as_categorization()

    LOGGER.info(f"About to run project: {project.name}")
    # run_apply_feedback=False skips model training, so only the continuous steps execute
    jobs_run = tbox.project.categorization.jobs.run(project, run_apply_feedback=False)
    LOGGER.info(f"Tasks for {project.name} complete")

    return jobs_run


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
    )

Run Categorization Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project step-by-step including model training
"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
    """Runs all steps of a categorization project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target project and view it as a categorization project
    project = client.projects.by_resource_id(categorization_project_id).as_categorization()

    # Run each step of the project in order, collecting the operations as we go
    LOGGER.info(f"About to run project with training: {project.name}")

    jobs = tbox.project.categorization.jobs
    all_operations: List[Operation] = []
    for step in (jobs.update_unified_dataset, jobs.apply_feedback_and_update_results):
        step_ops = step(project)
        LOGGER.info(f"Completed: {[op.description for op in step_ops]}")
        all_operations.extend(step_ops)

    LOGGER.info(f"All tasks for {project.name} complete")

    # Combined list of every operation run, in execution order
    return all_operations


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
    )

Bootstrap a Categorization Model

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for bootstrapping a Tamr Categorization project with taxonomy categories"""
import argparse
import logging
import pandas as pd
import requests

from typing import Any, Dict, Optional
from tamr_unify_client.project.attribute_mapping.resource import AttributeMappingSpec

import tamr_toolbox as tbox
from tamr_toolbox.project.categorization import metrics

# Module-level logger; reassigned to a fully configured logger in the __main__ block below
LOGGER = logging.getLogger(__name__)

# name of existing unified attribute to be compared against category names
UNIFIED_ATTRIBUTE_NAME = "description"


def main(
    *,
    instance_connection_info: Dict[str, Any],
    categorization_project_id: str,
    unified_attribute_name: str,
    category_tier: Optional[int] = None,
) -> None:
    """Bootstraps the model for a categorization project by adding the taxonomy as a separate
    source with training labels

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project
        unified_attribute_name: The unified attribute to map the category names onto
        category_tier: Which tier of the taxonomy to confine labels to. Use -1 for leaf nodes.
            If not passed, all categories at all tiers will be used.

    Returns:
        None

    Raises:
        RuntimeError: if the project has no attribute of the specified name, if a bootstrap
            dataset with the derived name already exists, or if the project is not associated
            with a taxonomy
    """

    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)

    # Retrieve the project
    project = tamr_client.projects.by_resource_id(categorization_project_id).as_categorization()
    LOGGER.info(f"Retrieved project with name: {project.name}")

    # Validate dataset and attribute names
    # Confirm the target unified attribute exists
    try:
        project.attributes.by_name(unified_attribute_name)
    except requests.exceptions.HTTPError as err:
        # chain the HTTP error so the underlying response details stay in the traceback
        raise RuntimeError(
            f"Project {project.name} has no attribute {unified_attribute_name}."
        ) from err

    # Create a dataset with taxonomy categories
    dataset_name = f"{project.unified_dataset().name}_taxonomy_bootstrap_dataset"
    try:
        project.client.datasets.by_name(dataset_name)
    except KeyError:
        # Dataset with `dataset_name` does not exist in Tamr. Proceed with dataset creation.
        pass
    else:
        dataset_exists_error = (
            f"A dataset with name {dataset_name} already exists. Try again after deleting the "
            "dataset."
        )
        LOGGER.error(dataset_exists_error)
        raise RuntimeError(dataset_exists_error)

    # Proceed with dataset creation
    # Get the project taxonomy
    try:
        project.taxonomy()
    except requests.exceptions.RequestException as err:
        raise RuntimeError(
            f"Project {project.name} is not associated with any taxonomy yet."
        ) from err
    LOGGER.info(f"Retrieved project taxonomy with name: {project.taxonomy().name}")

    # Bootstrap all available categories
    categories = project.taxonomy().categories()
    if category_tier is None:
        category_list = [category.path for category in categories]
    else:
        category_set = metrics._get_categories_at_tier(project=project, tier=category_tier)
        category_list = [category.split("|") for category in category_set]
        category_list.sort()

    # Create a dictionary of full path as a string to the leaf node name (used as label path)
    taxonomy_dict = {", ".join(category): category[-1] for category in category_list}

    # Create a dataframe
    df = pd.DataFrame(list(taxonomy_dict.items()), columns=["Category Path", "Category Name"])

    # Create a dataset in Tamr
    taxonomy_dataset = project.client.datasets.create_from_dataframe(
        df, primary_key_name="Category Path", dataset_name=dataset_name
    )
    LOGGER.info(f"Created a dataset in Tamr with name: {taxonomy_dataset.name}")

    # Add the dataset into the project
    project.add_input_dataset(taxonomy_dataset)
    LOGGER.info(f"Added {taxonomy_dataset.name} to project {project.name}")

    # Map category name attribute to new unified attribute
    attr_mapping_spec = (
        AttributeMappingSpec.new()
        .with_input_dataset_name(dataset_name)
        .with_input_attribute_name("Category Name")
        .with_unified_dataset_name(project.unified_dataset().name)
        .with_unified_attribute_name(unified_attribute_name)
    )
    project.attribute_mappings().create(attr_mapping_spec.to_dict())
    LOGGER.info(
        f"Created mapping from source attribute 'Category Name' to unified attribute "
        f"{unified_attribute_name}"
    )

    # Create transformation ensuring dataset tamr_id values match categorization path
    all_tx = tbox.project.schema_mapping.transformations.get_all(project)
    new_tx = (
        f"SELECT *, CASE WHEN origin_source_name = '{dataset_name}' THEN "
        f"concat(origin_source_name, '_', origin_entity_id) ELSE tamr_id END AS tamr_id;"
    )
    # Append so that it is applied after any other possibly conflicting transformations
    all_tx.unified_scope.append(new_tx)
    tbox.project.schema_mapping.transformations.set_all(project, all_tx)

    LOGGER.info("Updating the unified dataset...")
    tbox.project.categorization.jobs.update_unified_dataset(project)

    # Prepare and post labels; recordId must match the tamr_id produced by the transformation
    labels_to_bootstrap = [
        {
            "action": "CREATE",
            "recordId": f"{dataset_name}_{key}",
            "record": {"verified": {"category": {"path": path}, "reason": "Taxonomy bootstrap"}},
        }
        for key, path in taxonomy_dict.items()
    ]
    project.client.post(
        f"projects/{project.resource_id}/categorizations/labels:updateRecords",
        json=labels_to_bootstrap,
    ).successful()
    LOGGER.info(f"Created and inserted labels into {project.name}")

    # Apply feedback and update results
    tbox.project.categorization.jobs.apply_feedback_and_update_results(project)
    LOGGER.info("Successfully applied and updated the model")
    LOGGER.info(f"Completed bootstrapping taxonomy in project {project.name}")


if __name__ == "__main__":

    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger, replacing the bare module-level one,
    # and also enable tamr_toolbox's own log output in the same directory
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_toolbox_logging(log_directory=CONFIG["logging_dir"])

    # Run the main function; category_tier is left at its default (all tiers)
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
        unified_attribute_name=UNIFIED_ATTRIBUTE_NAME,
    )

Obtain Average Confidence for a Specific Tier

"""Snippet for retrieving confidence metrics from a Tamr Categorization project"""
import tamr_toolbox as tbox
from tamr_toolbox.project.categorization.metrics import get_tier_confidence

# Read config, make Tamr Client, make logger
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")

# Get a Tamr categorization project by ID
my_project = tamr.projects.by_resource_id("my_project_id")

# By default gets the average confidence at leaf nodes without allowing dataset to refresh
leaf_node_confidence_dict = get_tier_confidence(my_project)

# Can allow the dataset to refresh if it is not streamable
# NOTE THIS WILL KICK OFF A <MATERIALIZE VIEWS> JOB
leaf_node_confidence_dict2 = get_tier_confidence(my_project, allow_dataset_refresh=True)

# Can also set the specific tier, which starts at 1
tier1_confidence_dict = get_tier_confidence(my_project, tier=1)

Mastering

Run Mastering Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Mastering project without model training"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
    """Runs the continuous steps (no training) of a mastering project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        mastering_project_id: The id of the target mastering project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target project and view it as a mastering project
    project = client.projects.by_resource_id(mastering_project_id).as_mastering()

    LOGGER.info(f"About to run project: {project.name}")
    # Skip model training (apply feedback) and pair-count estimation: continuous steps only
    jobs_run = tbox.project.mastering.jobs.run(
        project, run_apply_feedback=False, run_estimate_pair_counts=False
    )
    LOGGER.info(f"Tasks for {project.name} complete")

    return jobs_run


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        mastering_project_id=CONFIG["projects"]["my_mastering_project"],
    )

Run Mastering Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Mastering project step-by-step including model training"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
    """Runs all steps of a mastering project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        mastering_project_id: The id of the target mastering project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target project and view it as a mastering project
    project = client.projects.by_resource_id(mastering_project_id).as_mastering()

    # Run every step of the project in order, collecting the operations as we go
    LOGGER.info(f"About to run project with training: {project.name}")

    jobs = tbox.project.mastering.jobs
    all_operations: List[Operation] = []
    for step in (
        jobs.update_unified_dataset,
        jobs.estimate_pair_counts,
        jobs.generate_pairs,
        jobs.apply_feedback_and_update_results,
        jobs.publish_clusters,
    ):
        step_ops = step(project)
        LOGGER.info(f"Completed: {[op.description for op in step_ops]}")
        all_operations.extend(step_ops)

    LOGGER.info(f"All tasks for {project.name} complete")

    # Combined list of every operation run, in execution order
    return all_operations


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        mastering_project_id=CONFIG["projects"]["my_mastering_project"],
    )

Golden Records

Run Golden Records Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project"""
import argparse
from typing import List, Dict

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, str], golden_records_project_id: str
) -> List[Operation]:
    """Runs the continuous steps of a golden records project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        golden_records_project_id: The id of the target golden records project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target golden records project
    project = client.projects.by_resource_id(golden_records_project_id)

    LOGGER.info(f"About to run project: {project.name}")
    # Run the typical steps of the project in one call
    jobs_run = tbox.project.golden_records.jobs.run(project)
    LOGGER.info(f"Tasks for {project.name} complete")

    return jobs_run


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
    )

Run Golden Records Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project step-by-step"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], golden_records_project_id: str
) -> List[Operation]:
    """Runs all steps of a golden records project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        golden_records_project_id: The id of the target golden records project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target golden records project
    project = client.projects.by_resource_id(golden_records_project_id)

    # Run each step of the project in order, collecting the operations as we go
    LOGGER.info(f"About to run project with training: {project.name}")

    jobs = tbox.project.golden_records.jobs
    all_operations: List[Operation] = []
    for step in (jobs.update_golden_records, jobs.publish_golden_records):
        step_ops = step(project)
        LOGGER.info(f"Completed: {[op.description for op in step_ops]}")
        all_operations.extend(step_ops)

    LOGGER.info(f"All tasks for {project.name} complete")

    # Combined list of every operation run, in execution order
    return all_operations


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
    )

Schema Mapping

Run Schema Mapping Simple

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Tamr Schema Mapping project"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Runs the continuous steps of a schema mapping project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns: List of jobs run

    """
    # Connect to the Tamr instance
    client = tbox.utils.client.create(**instance_connection_info)

    # Look up the target schema mapping project
    project = client.projects.by_resource_id(schema_mapping_project_id)

    LOGGER.info(f"About to run project: {project.name}")
    # Run the typical steps of the project in one call
    jobs_run = tbox.project.schema_mapping.jobs.run(project)
    LOGGER.info(f"Tasks for {project.name} complete")

    return jobs_run


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Use the configuration to create a global logger
    # NOTE: main() reads this module-level LOGGER, so it must be created before main is called
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Run Schema Mapping Step-By-Step

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for running a Schema Mapping project step-by-step"""
import argparse
from typing import List, Dict, Any

from tamr_unify_client.operation import Operation

import tamr_toolbox as tbox


def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Runs every step of a schema mapping project, one at a time

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns: List of jobs run, in execution order

    """
    # Connect to Tamr and look up the target project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(schema_mapping_project_id)

    LOGGER.info(f"About to run project: {project.name}")

    # Update the unified dataset; each step returns a list of operations
    update_ops = tbox.project.schema_mapping.jobs.update_unified_dataset(project)
    LOGGER.info(f"Completed: {[op.description for op in update_ops]}")

    LOGGER.info(f"All tasks for {project.name} complete")

    # Return a combined list of all operations run by this script, in order
    return list(update_ops)


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed = arg_parser.parse_args()

    # Read configuration from the given path, or fall back to the default location
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Build the module-level logger that main() writes to
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # Kick off the workflow
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Transformations

Edit Unified Transformations

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations scoped to the unified dataset
in a Tamr project"""
from typing import Dict, Any

import tamr_toolbox as tbox
import argparse


def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edits the unified-dataset transformations of a project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project

    """
    # Connect to Tamr and look up the target project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(project_id)

    LOGGER.info(f"Updating transformations for {project.name}")

    # Fetch every transformation scoped to the unified dataset.
    # The same function is available as tbox.project.mastering.transformations.get_all_unified
    # and tbox.project.categorization.transformations.get_all_unified
    unified_tx = tbox.project.schema_mapping.transformations.get_all_unified(project)

    # Inspect the fetched transformations; example output:
    #   [
    #   'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    #   '//Just a comment',
    #   '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
    #   ]
    LOGGER.debug(unified_tx)

    # All edits below are local; Tamr only changes when set_all_unified is called at the end

    # Drop the second to last transformation on the unified dataset
    unified_tx.pop(-2)

    # Add a transformation at the end of the unified dataset's list
    unified_tx.append("SELECT *, lower(to_string(last_name)) as last_name;")

    # Insert a transformation in the second position of the unified dataset's list
    unified_tx.insert(1, "SELECT *, to_int(ssn) as ssn;")

    # Inspect the edited list before posting; example output:
    #   [
    #   'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    #   'SELECT *, to_int(ssn) as ssn;',
    #   '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
    #   'SELECT *, lower(to_string(last_name)) as last_name;'
    #   ]
    LOGGER.debug(unified_tx)

    # Push the updated transformations back to the Tamr project.
    # The same function is available as tbox.project.mastering.transformations.set_all_unified
    # and tbox.project.categorization.transformations.set_all_unified
    tbox.project.schema_mapping.transformations.set_all_unified(project, unified_tx)

    LOGGER.info(f"Completed updating unified transformations for {project.name}")


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed = arg_parser.parse_args()

    # Read configuration from the given path, or fall back to the default location
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Build the module-level logger and route tamr-toolbox / tamr-unify-client
    # package logging into the same directory
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )

    # Kick off the workflow
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Edit Unified and Input Transformations

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

projects:
    my_mastering_project: "1"
    my_golden_records_project: "2"
    my_categorization_project: "3"
    my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations of a Tamr project"""
from typing import Dict, Any

import tamr_toolbox as tbox
import argparse


def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edits both the input and the unified transformations of a project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project

    """
    # Connect to Tamr and look up the target project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(project_id)

    LOGGER.info(f"Updating transformations for {project.name}")

    # Fetch the full Transformations object (input scope and unified scope).
    # The same function is available as tbox.project.mastering.transformations.get_all
    # and tbox.project.categorization.transformations.get_all
    tx = tbox.project.schema_mapping.transformations.get_all(project)

    # Inspect the fetched transformations; example output:
    #   Transformations(
    #     input_scope=[
    #       InputTransformation(
    #         transformation='select *, lower(to_string(first_name)) as "first_name"',
    #         datasets=[Dataset(relative_id='datasets/3', name='people_tiny.csv', version='2')])
    #       ],
    #     unified_scope=[
    #       'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    #       '//Just a comment',
    #       '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
    #       ]
    #     )
    LOGGER.debug(tx)

    # All edits below are local; Tamr only changes when set_all is called at the end

    # Drop the second to last transformation on the unified dataset
    tx.unified_scope.pop(-2)

    # Add a transformation at the end of the unified dataset's list
    tx.unified_scope.append("SELECT *, lower(to_string(last_name)) as last_name;")

    # Insert a transformation in the second position of the unified dataset's list
    tx.unified_scope.insert(1, "SELECT *, to_int(ssn) as ssn;")

    # Prepend an input transformation scoped to a single source dataset.
    # InputTransformation is also available under tbox.project.mastering.transformations
    # and tbox.project.categorization.transformations
    employees = project.input_datasets().by_name("employees_tiny.csv")
    ssn_input_tx = tbox.project.schema_mapping.transformations.InputTransformation(
        "SELECT *, to_int(ssn) as ssn;", [employees]
    )
    tx.input_scope.insert(0, ssn_input_tx)

    # Inspect the edited object before posting; example output:
    #   Transformations(
    #     input_scope=[
    #       InputTransformation(
    #         transformation='SELECT *, to_int(ssn) as ssn;',
    #         datasets=[Dataset(relative_id='datasets/64', name='employees_tiny.csv',
    #                           version='162')]),
    #       InputTransformation(
    #         transformation='select *, lower(to_string(first_name)) as "first_name"',
    #         datasets=[Dataset(relative_id='datasets/3', name='people_tiny.csv', version='2')])
    #       ],
    #     unified_scope=[
    #       'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    #       'SELECT *, to_int(ssn) as ssn;',
    #       '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
    #       'SELECT *, lower(to_string(last_name)) as last_name;'
    #       ]
    #     )
    LOGGER.debug(tx)

    # Push the updated transformations back to the Tamr project.
    # The same function is available as tbox.project.mastering.transformations.set_all
    # and tbox.project.categorization.transformations.set_all
    tbox.project.schema_mapping.transformations.set_all(project, tx)

    LOGGER.info(f"Completed updating transformations for {project.name}")


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed = arg_parser.parse_args()

    # Read configuration from the given path, or fall back to the default location
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )

    # Build the module-level logger and route tamr-toolbox / tamr-unify-client
    # package logging into the same directory
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )

    # Kick off the workflow
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )

Project Movement

Import Artifacts

"""Snippet for importing project artifacts into a Tamr project"""
import tamr_toolbox as tbox
from tamr_toolbox.project import import_artifacts

# Build a client for the Tamr instance
client = tbox.utils.client.create(username="user", password="pw", host="localhost")

# Look up the target project (needed only when importing into an existing project)
target = client.projects.by_name("current_categorization_project")

# Location of the project artifact zip on the server hosting the Tamr instance
artifact_path = "/home/ubuntu/tamr/projectExports/minimal_categorization-1622067179477.zip"

# Overwrite an existing project with the imported artifacts
# (the overwrite_existing flag is required for this operation)
existing_project_op = import_artifacts(
    project_artifact_path=str(artifact_path),
    tamr_client=client,
    target_project=target,
    overwrite_existing=True,
)

# Show the resulting operation
print(existing_project_op)

# Create a brand-new project from the same artifacts
new_project_op = import_artifacts(
    project_artifact_path=str(artifact_path),
    tamr_client=client,
    new_project_name="new_categorization",
)

# Show the resulting operation
print(new_project_op)

Export Artifacts

"""Snippet for exporting project artifacts from a Tamr project"""
import tamr_toolbox as tbox
from tamr_toolbox.project import export_artifacts
from tamr_toolbox.models.project_artifacts import CategorizationArtifacts

# Build a client for the Tamr instance
client = tbox.utils.client.create(username="user", password="pw", host="localhost")

# Look up the project to export
target = client.projects.by_resource_id("my_project_id")

# Directory on the server hosting the Tamr instance that receives the export
path_export_dir = "/home/ubuntu/tamr/projectExports"

# Artifacts to leave out of the export; codes may be spelled out directly
# when known, or referenced via the CategorizationArtifacts dataclass
exclude_list = [
    CategorizationArtifacts.CATEGORIZATION_VERIFIED_LABELS,
    "CATEGORIZATION_TAXONOMIES",
    CategorizationArtifacts.CATEGORIZATION_FEEDBACK,
]

# Run the export synchronously
export_op = export_artifacts(
    project=target,
    artifact_directory_path=path_export_dir,
    exclude_artifacts=exclude_list,
    asynchronous=False,
)

# Show the resulting operation
print(export_op)

Fork Project

"""Example script for creating a copy of an existing project with a new name

To copy a project as "_copy"-postfixed name of the target project:
    python fork_project.py --export_path /path/to/export-file-dir/
                           --project_name <target_project_name>
                           --postfix _copy

To specify new project name explicitly:
    python fork_project.py --export_path /path/to/export-file-dir/
                           --project_name <target_project_name>
                           --new_name <new_project_name>
"""

import os
from pathlib import Path
from typing import Dict, Any
import argparse
import tamr_toolbox as tbox
from tamr_toolbox import utils as tbu
from tamr_toolbox.utils.client import Client
from tamr_toolbox.utils.operation import Operation


def export_from_tamr(client: Client, *, project_name: str, export_path: str,) -> Operation:
    """
    Kicks off an export of a project's artifacts from Tamr.

    When no export path is supplied, artifacts are written to
    "project-movement/<project_name>" next to this script.

    Args:
        client: an instance of Tamr unify client object
        project_name: name of the project to be exported from Tamr
        export_path: export path - must be accessible to the VM hosting Tamr

    Returns:
        operation for project export api call
    """
    target_project = client.projects.by_name(project_name)
    # Fall back to the default directory beside this script when no path was given
    destination = export_path or os.path.join(
        Path(__file__).resolve().parent, f"project-movement/{project_name}"
    )
    return tbox.project.export_artifacts(
        project=target_project, artifact_directory_path=destination
    )


def main(
    *,
    project_name: str,
    new_name: str,
    postfix: str,
    new_ud_name: str,
    export_path: Path,
    overwrite: bool,
    instance_connection_info: Dict[str, Any],
):
    """
    This function creates a fork copy of a Tamr project by exporting the target project
    and importing it back to Tamr under a new name

    Args:
        project_name: name of the existing target project
        new_name: name of the forked project (Optional; mutually exclusive with postfix)
        postfix: if specified, the forked project is named project_name + postfix (Optional)
        new_ud_name: explicitly specify the name of unified dataset of forked project (Optional)
        export_path: export path - must be accessible to the VM hosting Tamr
        overwrite: flag to overwrite existing project artifacts
        instance_connection_info: Tamr instance & AUTH configs

    Raises:
        FileNotFoundError: if no .zip export artifact is found under export_path
    """

    tamr_client = tbu.client.create(**instance_connection_info)

    # Export the target project from Tamr and require the operation to succeed
    LOGGER.info(f"Project {project_name} export from Tamr initializing...")
    op = export_from_tamr(tamr_client, project_name=project_name, export_path=export_path)
    tbu.operation.enforce_success(op)
    LOGGER.info(op)

    # Preparing for the import: locate the exported zip artifact.
    # Fail fast with a clear message instead of an opaque IndexError when none exists.
    zip_candidates = [
        f
        for f in os.listdir(export_path)
        if os.path.isfile(os.path.join(export_path, f)) and f.endswith(".zip")
    ]
    if not zip_candidates:
        raise FileNotFoundError(f"No .zip project export artifact found in {export_path}")
    zipfile_path = os.path.join(export_path, zip_candidates[0])

    # Constructing the new project name and respective unified dataset name
    new_project_name = new_name if new_name else f"{project_name}{postfix}"
    new_ud_name = new_ud_name if new_ud_name else f"{new_project_name}_unified_dataset"

    # Importing a copy of the target project to Tamr
    LOGGER.info(f"Project {new_project_name} import to Tamr initializing...")
    op = tbox.project.import_artifacts(
        tamr_client=tamr_client,
        project_artifact_path=zipfile_path,
        new_project_name=new_project_name,
        new_unified_dataset_name=new_ud_name,
        overwrite_existing=overwrite,
    )
    tbu.operation.enforce_success(op)
    LOGGER.info(op)
    LOGGER.info(
        f"Project {new_project_name} was successfully forked from Tamr project {project_name}!"
    )


if __name__ == "__main__":
    # Command line interface (argument order is preserved so --help output is unchanged)
    cli = argparse.ArgumentParser()
    cli.add_argument("--config", help="path to a YAML configuration file", required=False)
    cli.add_argument(
        "--project_name",
        default=None,
        required=True,
        help="raise to specify the name of target project",
    )
    cli.add_argument(
        "--export_path",
        default=None,
        required=True,
        help="raise to specify the path to export directory",
    )
    cli.add_argument(
        "--new_ud_name",
        default=None,
        help="raise to explicitly specify the name of forked project unified dataset (optional)",
    )
    cli.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="if raised will replace a project with specified target name (if one already exists)",
    )
    # Exactly one of --new_name / --postfix must be supplied
    naming = cli.add_mutually_exclusive_group(required=True)
    naming.add_argument(
        "--new_name", default=None, help="raise to explicitly specify the name of the new project",
    )
    naming.add_argument(
        "--postfix",
        default=None,
        help="raise to imply the name of the new project by postfixing the original project name",
    )
    parsed = cli.parse_args()

    # Read configuration from the given path, or fall back to the default location
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Build the module-level logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Let Tamr Toolbox itself also contribute to the log
    tbu.logger.enable_toolbox_logging(log_directory=CONFIG["logging_dir"], log_to_terminal=False)

    main(
        project_name=parsed.project_name,
        new_name=parsed.new_name,
        postfix=parsed.postfix,
        new_ud_name=parsed.new_ud_name,
        export_path=parsed.export_path,
        overwrite=parsed.overwrite,
        instance_connection_info=CONFIG["tamr"],
    )