Projects¶
General¶
Add dataset to project and perform schema mapping¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""
Simple script to add a dataset and perform mappings via list of tuples or optionally bootstrap the
entire dataset.
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click
@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project to which to add the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to map")
@click.option(
    "--bootstrap", help="flag for whether or not to bootstrap the entire dataset", is_flag=True
)
@click.option(
    "--mappings",
    help="list of mappings to apply in format "
    "source_attr1,unified_attr1;source_attr2,unified_attr2",
    default="",
)
def main(
    config_file: str, project_name: str, source_dataset_name: str, bootstrap: bool, mappings: str
) -> None:
    """
    Add a Tamr dataset to a Tamr project and optionally bootstrap it or map it to the unified
    dataset following given attributes mapping

    Args:
        config_file: path to the config file containing server information
        project_name: name of the project to add a dataset to
        source_dataset_name: name of the dataset to add
        bootstrap: flag to bootstrap the entire dataset to the unified dataset of the project
        mappings: mappings to use to map the source dataset to the unified dataset, mappings
            should follow the format "source_attr1,unified_attr1;source_attr2,unified_attr2"
    """
    # setup logger
    logger = tbox.utils.logger.create("my_logger")
    # get config and setup client
    config = tbox.utils.config.from_yaml(config_file)
    client = tbox.utils.client.create(**config["my_tamr_instance"])
    # grab project and source dataset
    project = client.projects.by_name(project_name)
    source_dataset = client.datasets.by_name(source_dataset_name)
    if bootstrap:
        # bootstrap maps the whole dataset; force_add_dataset_to_project=True also
        # adds the dataset to the project if it isn't already in it
        logger.info(f"bootstrapping dataset {source_dataset_name} in project {project_name}")
        tbox.project.mastering.schema.bootstrap_dataset(
            project, source_dataset=source_dataset, force_add_dataset_to_project=True
        )
        # if mappings is empty string we are done
        if mappings == "":
            logger.info("bootstrapped and mappings are empty so finishing")
            return
    else:
        if mappings == "":
            logger.warning(
                "bootstrap not chosen but no mappings specified so exiting without doing anything"
            )
            return
        # not bootstrap, manually add the dataset before applying mappings.
        # NOTE: this is done only in the non-bootstrap branch — previously it ran even
        # after bootstrapping, redundantly re-adding the dataset and logging a
        # misleading "bootstrap not chosen" message
        logger.info(
            f"bootstrap not chosen so manually adding {source_dataset_name} to project {project_name}"
        )
        project.add_input_dataset(source_dataset)
    # parse mapping tuples from the "src1,uni1;src2,uni2" format
    try:
        mapping_tuples = [(x.split(",")[0], x.split(",")[1]) for x in mappings.split(";")]
    except Exception as e:
        error_message = (
            f"Provided mappings do not follow the format "
            f"'source_attr1,unified_attr1;source_attr2,unified_attr2', error: {e}"
        )
        logger.error(error_message)
        # chain the original exception so the parsing failure is preserved
        raise RuntimeError(error_message) from e
    # apply each source-attribute -> unified-attribute mapping
    for source_attr, unified_attr in mapping_tuples:
        logger.debug(f"applying the following mapping: {source_attr} --> {unified_attr}")
        tbox.project.mastering.schema.map_attribute(
            project,
            source_attribute_name=source_attr,
            source_dataset_name=source_dataset.name,
            unified_attribute_name=unified_attr,
        )


if __name__ == "__main__":
    main()
Unmap datasets and remove from project¶
"""
Simple script to wholly unmap a dataset and remove it from a project
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click
@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project from which to remove the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to unmap/remove")
def main(config_file: str, project_name: str, source_dataset_name: str) -> None:
    """
    Unmap a dataset and remove it from a project

    Args:
        config_file: path to the config file containing server information
        project_name: name of the project to remove a dataset from
        source_dataset_name: name of the dataset to remove
    """
    # setup logger
    logger = tbox.utils.logger.create("my_logger")
    # get config and setup client
    config = tbox.utils.config.from_yaml(config_file)
    client = tbox.utils.client.create(**config["my_tamr_instance"])
    # get dataset and project
    source_dataset = client.datasets.by_name(source_dataset_name)
    project = client.projects.by_name(project_name)
    logger.info(
        f"unmapping and removing dataset {source_dataset_name} from project {project_name}"
    )
    # unmap every attribute of the dataset; remove_dataset_from_project=True also
    # detaches the dataset from the project after unmapping
    tbox.project.mastering.schema.unmap_dataset(
        project, source_dataset=source_dataset, remove_dataset_from_project=True
    )


if __name__ == "__main__":
    main()
Categorization¶
Run Categorization Simple¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project without model training"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
    """Runs the continuous steps (no training) of a categorization project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project

    Returns: List of jobs run
    """
    # Connect to Tamr and look up the target project as a categorization project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(categorization_project_id).as_categorization()

    # Run the continuous jobs of the project, skipping model training
    LOGGER.info(f"About to run project: {project.name}")
    completed_ops = tbox.project.categorization.jobs.run(project, run_apply_feedback=False)
    LOGGER.info(f"Tasks for {project.name} complete")
    return completed_ops


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed_args = arg_parser.parse_args()

    # Load configuration from --config if given, otherwise from the default path
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed_args.config,
        default_path_to_file="/path/to/my/conf/project.config.yaml",
    )
    # Create a global logger from the configuration
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
    )
Run Categorization Step-By-Step¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project step-by-step including model training
"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
    """Runs all steps of a categorization project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project

    Returns: List of jobs run
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    # Retrieve the project
    my_project = tamr_client.projects.by_resource_id(categorization_project_id)
    # Narrow the generic project object to its categorization-specific interface
    my_project = my_project.as_categorization()
    # Run all steps of a project, step-by-step (order matters)
    LOGGER.info(f"About to run project with training: {my_project.name}")
    # Step 1: update the unified dataset from the current inputs and mappings
    op_list1 = tbox.project.categorization.jobs.update_unified_dataset(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
    # Step 2: apply feedback (labels) and update the categorization results
    op_list2 = tbox.project.categorization.jobs.apply_feedback_and_update_results(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list2]}")
    LOGGER.info(f"All tasks for {my_project.name} complete")
    # Each step returns a list of operations
    # We return a combined list of all operation run in the script, in the order that they were run
    return [*op_list1, *op_list2]


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        categorization_project_id=CONFIG["projects"]["my_categorization_project"],
    )
Obtain Average Confidence for a Specific Tier¶
"""Snippet for retrieving confidence metrics from a Tamr Categorization project"""
import tamr_toolbox as tbox
from tamr_toolbox.project.categorization.metrics import get_tier_confidence
# Read config, make Tamr Client, make logger
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")
# Get a Tamr categorization project by ID
my_project = tamr.projects.by_resource_id("my_project_id")
# By default gets the average confidence at leaf nodes without allowing dataset to refresh
leaf_node_confidence_dict = get_tier_confidence(my_project)
# Can allow the dataset to refresh if it is not streamable
# NOTE THIS WILL KICK OFF A <MATERIALIZE VIEWS> JOB
leaf_node_confidence_dict2 = get_tier_confidence(my_project, allow_dataset_refresh=True)
# Can also set the specific tier, which starts at 1
tier1_confidence_dict = get_tier_confidence(my_project, tier=1)
Mastering¶
Run Mastering Simple¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Tamr Mastering project without model training"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
    """Runs the continuous steps (no training) of a mastering project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        mastering_project_id: The id of the target mastering project

    Returns: List of jobs run
    """
    # Connect to Tamr and look up the target project as a mastering project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(mastering_project_id).as_mastering()

    # Run the continuous jobs, skipping model training and pair-count estimation
    LOGGER.info(f"About to run project: {project.name}")
    completed_ops = tbox.project.mastering.jobs.run(
        project, run_apply_feedback=False, run_estimate_pair_counts=False
    )
    LOGGER.info(f"Tasks for {project.name} complete")
    return completed_ops


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed_args = arg_parser.parse_args()

    # Load configuration from --config if given, otherwise from the default path
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed_args.config,
        default_path_to_file="/path/to/my/conf/project.config.yaml",
    )
    # Create a global logger from the configuration
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        mastering_project_id=CONFIG["projects"]["my_mastering_project"],
    )
Run Mastering Step-By-Step¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Mastering project step-by-step including model training"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
    """Runs all steps of a mastering project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        mastering_project_id: The id of the target mastering project

    Returns: List of jobs run
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    # Retrieve the project
    my_project = tamr_client.projects.by_resource_id(mastering_project_id)
    # Narrow the generic project object to its mastering-specific interface
    my_project = my_project.as_mastering()
    # Run all steps of a project, step-by-step (order matters)
    LOGGER.info(f"About to run project with training: {my_project.name}")
    # Step 1: update the unified dataset from the current inputs and mappings
    op_list1 = tbox.project.mastering.jobs.update_unified_dataset(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
    # Step 2: estimate pair counts
    op_list2 = tbox.project.mastering.jobs.estimate_pair_counts(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list2]}")
    # Step 3: generate record pairs
    op_list3 = tbox.project.mastering.jobs.generate_pairs(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list3]}")
    # Step 4: apply feedback (labels) and update results
    op_list4 = tbox.project.mastering.jobs.apply_feedback_and_update_results(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list4]}")
    # Step 5: publish the resulting clusters
    op_list5 = tbox.project.mastering.jobs.publish_clusters(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list5]}")
    LOGGER.info(f"All tasks for {my_project.name} complete")
    # Each step returns a list of operations
    # We return a combined list of all operation run in the script, in the order that they were run
    return [*op_list1, *op_list2, *op_list3, *op_list4, *op_list5]


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        mastering_project_id=CONFIG["projects"]["my_mastering_project"],
    )
Golden Records¶
Run Golden Records Simple¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project"""
import argparse
from typing import List, Dict
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, str], golden_records_project_id: str
) -> List[Operation]:
    """Runs the continuous steps of a golden records project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        golden_records_project_id: The id of the target golden records project

    Returns: List of jobs run
    """
    # Connect to Tamr and look up the target golden records project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(golden_records_project_id)

    # Run the continuous jobs of the project
    LOGGER.info(f"About to run project: {project.name}")
    completed_ops = tbox.project.golden_records.jobs.run(project)
    LOGGER.info(f"Tasks for {project.name} complete")
    return completed_ops


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed_args = arg_parser.parse_args()

    # Load configuration from --config if given, otherwise from the default path
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed_args.config,
        default_path_to_file="/path/to/my/conf/project.config.yaml",
    )
    # Create a global logger from the configuration
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
    )
Run Golden Records Step-By-Step¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project step-by-step"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], golden_records_project_id: str
) -> List[Operation]:
    """Runs all steps of a golden records project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        golden_records_project_id: The id of the target golden records project

    Returns: List of jobs run
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    # Retrieve the project
    my_project = tamr_client.projects.by_resource_id(golden_records_project_id)
    # Run all steps of a project, step-by-step (order matters)
    LOGGER.info(f"About to run project with training: {my_project.name}")
    # Step 1: update the golden records
    op_list1 = tbox.project.golden_records.jobs.update_golden_records(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
    # Step 2: publish the golden records
    op_list2 = tbox.project.golden_records.jobs.publish_golden_records(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list2]}")
    LOGGER.info(f"All tasks for {my_project.name} complete")
    # Each step returns a list of operations
    # We return a combined list of all operation run in the script, in the order that they were run
    return [*op_list1, *op_list2]


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
    )
Schema Mapping¶
Run Schema Mapping Simple¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Tamr Schema Mapping project"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Runs the continuous steps of a schema mapping project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns: List of jobs run
    """
    # Connect to Tamr and look up the target schema mapping project
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(schema_mapping_project_id)

    # Run the continuous jobs of the project
    LOGGER.info(f"About to run project: {project.name}")
    completed_ops = tbox.project.schema_mapping.jobs.run(project)
    LOGGER.info(f"Tasks for {project.name} complete")
    return completed_ops


if __name__ == "__main__":
    # Parse command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parsed_args = arg_parser.parse_args()

    # Load configuration from --config if given, otherwise from the default path
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=parsed_args.config,
        default_path_to_file="/path/to/my/conf/project.config.yaml",
    )
    # Create a global logger from the configuration
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Run Schema Mapping Step-By-Step¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Schema Mapping project step-by-step"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Runs all steps of a schema mapping project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns: List of jobs run
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    # Retrieve the project
    my_project = tamr_client.projects.by_resource_id(schema_mapping_project_id)
    # Run all steps of a project, step-by-step
    # (schema mapping has a single step: updating the unified dataset)
    LOGGER.info(f"About to run project: {my_project.name}")
    op_list1 = tbox.project.schema_mapping.jobs.update_unified_dataset(my_project)
    LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
    LOGGER.info(f"All tasks for {my_project.name} complete")
    # Each step returns a list of operations
    # We return a combined list of all operation run in the script, in the order that they were run
    return [*op_list1]


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Transformations¶
Edit Unified Transformations¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations scoped to the unified dataset
in a Tamr project"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse
def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edits unified transformations to a project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    # Retrieve the project
    my_project = tamr_client.projects.by_resource_id(project_id)
    # Retrieve the unified transformations of a project (a list of transformation strings)
    LOGGER.info(f"Updating transformations for {my_project.name}")
    all_unified_tx = tbox.project.schema_mapping.transformations.get_all_unified(my_project)
    # this can also be called as tbox.project.mastering.transformations.get_all_unified
    # this can also be called as tbox.project.categorization.transformations.get_all_unified
    # View the transformations
    LOGGER.debug(all_unified_tx)
    # Example print output
    """
    [
        'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
        '//Just a comment',
        '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
    ]
    """
    # We edit the object all_unified_tx locally.
    # Once it is in the final desired state we post it to Tamr
    # Delete the second to last transformation on the unified dataset
    all_unified_tx.pop(-2)
    # Append a transformation on the unified dataset
    all_unified_tx.append("SELECT *, lower(to_string(last_name)) as last_name;")
    # Insert a transformation as the second transformation on the unified dataset
    all_unified_tx.insert(1, "SELECT *, to_int(ssn) as ssn;")
    # View the transformations
    LOGGER.debug(all_unified_tx)
    # Example print output
    """
    [
        'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
        'SELECT *, to_int(ssn) as ssn;',
        '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
        'SELECT *, lower(to_string(last_name)) as last_name;'
    ]
    """
    # Set the transformations on your Tamr project with the updated transformations
    tbox.project.schema_mapping.transformations.set_all_unified(my_project, all_unified_tx)
    # this can also be called as tbox.project.mastering.transformations.set_all_unified
    # this can also be called as tbox.project.categorization.transformations.set_all_unified
    LOGGER.info(f"Completed updating unified transformations for {my_project.name}")


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Direct the loggers for tamr-toolbox and tamr-unify-client to the same file
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )
    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Edit Unified and Input Transformations¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations of a Tamr project"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse
def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edits transformations to a project

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)
    # Retrieve the project
    my_project = tamr_client.projects.by_resource_id(project_id)
    # Retrieve the transformations of a project
    # (a Transformations object with input_scope and unified_scope)
    LOGGER.info(f"Updating transformations for {my_project.name}")
    all_tx = tbox.project.schema_mapping.transformations.get_all(my_project)
    # this can also be called as tbox.project.mastering.transformations.get_all
    # this can also be called as tbox.project.categorization.transformations.get_all
    # View the transformations
    LOGGER.debug(all_tx)
    # Example print output
    """
    Transformations(
        input_scope=[
            InputTransformation(
                transformation='select *, lower(to_string(first_name)) as "first_name"',
                datasets=[
                    tamr_unify_client.dataset.resource.Dataset(
                        relative_id='datasets/3', name='people_tiny.csv', version='2')
                ]
            )
        ],
        unified_scope=[
            'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
            '//Just a comment',
            '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
        ]
    )
    """
    # We edit the object all_tx locally. Once it is in the final desired state we post it to Tamr
    # Delete the second to last transformation on the unified dataset
    all_tx.unified_scope.pop(-2)
    # Append a transformation on the unified dataset
    all_tx.unified_scope.append("SELECT *, lower(to_string(last_name)) as last_name;")
    # Insert a transformation as the second transformation on the unified dataset
    all_tx.unified_scope.insert(1, "SELECT *, to_int(ssn) as ssn;")
    # Add a transformation as the first input transformation
    # (input transformations are scoped to specific input datasets)
    employee_dataset = my_project.input_datasets().by_name("employees_tiny.csv")
    new_input_tx = tbox.project.schema_mapping.transformations.InputTransformation(
        "SELECT *, to_int(ssn) as ssn;", [employee_dataset]
    )
    # this can also be called as tbox.project.mastering.transformations.InputTransformation
    # this can also be called as tbox.project.categorization.transformations.InputTransformation
    all_tx.input_scope.insert(0, new_input_tx)
    # View the transformations
    LOGGER.debug(all_tx)
    # Example print output
    """
    Transformations(
        input_scope=[
            InputTransformation(
                transformation='SELECT *, to_int(ssn) as ssn;',
                datasets=[
                    tamr_unify_client.dataset.resource.Dataset(
                        relative_id='datasets/64', name='employees_tiny.csv', version='162')
                ]
            ),
            InputTransformation(
                transformation='select *, lower(to_string(first_name)) as "first_name"',
                datasets=[
                    tamr_unify_client.dataset.resource.Dataset(
                        relative_id='datasets/3', name='people_tiny.csv', version='2')
                ]
            )
        ],
        unified_scope=[
            'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
            'SELECT *, to_int(ssn) as ssn;',
            '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
            'SELECT *, lower(to_string(last_name)) as last_name;'
        ]
    )
    """
    # Set the transformations on your Tamr project with the updated transformations
    tbox.project.schema_mapping.transformations.set_all(my_project, all_tx)
    # this can also be called as tbox.project.mastering.transformations.set_all
    # this can also be called as tbox.project.categorization.transformations.set_all
    LOGGER.info(f"Completed updating transformations for {my_project.name}")


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Direct the loggers for tamr-toolbox and tamr-unify-client to the same file
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )
    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Project Movement¶
Import Artifacts¶
"""Snippet for importing project artifacts into a Tamr project"""
import tamr_toolbox as tbox
from tamr_toolbox.project import import_artifacts
# Read config, make Tamr Client
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")
# Get project object (only necessary if importing into existing project)
project = tamr.projects.by_name("current_categorization_project")
# Set path to project artifact zip (on server containing tamr instance)
artifact_path = "/home/ubuntu/tamr/projectExports/minimal_categorization-1622067179477.zip"
# Import project artifacts into existing project
# (overwrite_existing flag is necessary for this operation)
op_1 = import_artifacts(
project_artifact_path=str(artifact_path),
tamr_client=tamr,
target_project=project,
overwrite_existing=True,
)
# Print operation
print(op_1)
# Import project artifacts into new project
op_2 = import_artifacts(
project_artifact_path=str(artifact_path),
tamr_client=tamr,
new_project_name="new_categorization",
)
# Print operation
print(op_2)
Export Artifacts¶
"""Snippet for exporting project artifacts from a Tamr project"""
import tamr_toolbox as tbox
from tamr_toolbox.project import export_artifacts
from tamr_toolbox.models.project_artifacts import CategorizationArtifacts
# Read config, make Tamr Client
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")
# Get project object
project = tamr.projects.by_resource_id("my_project_id")
# Set path to export directory (on server containing tamr instance)
path_export_dir = "/home/ubuntu/tamr/projectExports"
# Make list of categorization artifacts to exclude.
# You can spell out the artifact code if known,
# or list access via the CategorizationArtifacts dataclass
exclude_list = [
CategorizationArtifacts.CATEGORIZATION_VERIFIED_LABELS,
"CATEGORIZATION_TAXONOMIES",
CategorizationArtifacts.CATEGORIZATION_FEEDBACK,
]
# Export project artifacts
op = export_artifacts(
project=project,
artifact_directory_path=path_export_dir,
exclude_artifacts=exclude_list,
asynchronous=False,
)
# Print operation
print(op)