Projects¶
General¶
Add dataset to project and perform schema mapping¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""
Simple script to add a dataset and perform mappings via list of tuples or optionally bootstrap the
entire dataset.
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click
@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project to which to add the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to map")
@click.option(
"--bootstrap", help="flag for whether or not to bootstrap the entire dataset", is_flag=True
)
@click.option(
"--mappings",
help="list of mappings to apply in format "
"source_attr1,unified_attr1;source_attr2,unified_attr2",
default="",
)
def main(
config_file: str, project_name: str, source_dataset_name: str, bootstrap: bool, mappings: str
) -> None:
"""
Add a Tamr dataset to a Tamr project and optionally bootstrap it or map it to the unified
dataset following given attributes mapping
Args:
config_file: path to the config file containing server information
project_name: name of the project to add a dataset to
source_dataset_name: name of the dataset to add
bootstrap: flag to boostrap the entire dataset to the unified dataset of the project
mappings: mappings to use to map the source dataset to the unified dataset, mappings
should follow the format "source_attr1,unified_attr1;source_attr2,unified_attr2"
Returns:
"""
# setup logger
logger = tbox.utils.logger.create("my_logger")
# get config and setup client
config = tbox.utils.config.from_yaml(config_file)
client = tbox.utils.client.create(**config["my_tamr_instance"])
# grab project and source dataset
project = client.projects.by_name(project_name)
source_dataset = client.datasets.by_name(source_dataset_name)
# if bootstrap then call bootstrap function with flag to add dataset to project if it
# isn't already in
if bootstrap:
logger.info(f"bootstrapping dataset {source_dataset_name} in project {project_name}")
tbox.project.mastering.schema.bootstrap_dataset(
project, source_dataset=source_dataset, force_add_dataset_to_project=True
)
# if mappings is empty string we are done
if mappings == "":
logger.info("bootstrapped and mappings are empty so finishing")
return None
else:
if mappings == "":
logger.warning(
"bootstrap not chosen but no mappings specified so exiting without doing anything"
)
return None
# not bootstrap, manually add, and do mappings
logger.info(
f"bootstrap not chosen so manually adding {source_dataset_name} to project {project_name}"
)
project.add_input_dataset(source_dataset)
# parse mapping tuples
try:
mapping_tuples = [(x.split(",")[0], x.split(",")[1]) for x in mappings.split(";")]
except Exception as e:
error_message = (
f"Provided mappings do not follow the format "
f"'source_attr1,unified_attr1;source_attr2,unified_attr2', error: {e}"
)
logger.error(error_message)
raise RuntimeError(error_message)
for (source_attr, unified_attr) in mapping_tuples:
logger.debug(f"applying the following mapping: {source_attr} --> {unified_attr}")
tbox.project.mastering.schema.map_attribute(
project,
source_attribute_name=source_attr,
source_dataset_name=source_dataset.name,
unified_attribute_name=unified_attr,
)
if __name__ == "__main__":
main()
Unmap datasets and remove from project¶
"""
Simple script to wholly unmap a dataset and remove it from a project
Can be used for any project type that has a schema mapping element
(e.g. all of 'from tamr_toolbox.project.<mastering,categorization,schema_mapping> import schema'
will work)
"""
import tamr_toolbox as tbox
import click
@click.command()
@click.option("--config_file", help="the yaml config file used to set up tamr client")
@click.option("--project_name", help="the name of the project from which to remove the dataset")
@click.option("--source_dataset_name", help="the name of the dataset to unmap/remove")
def main(config_file: str, project_name: str, source_dataset_name: str) -> None:
"""
Unmap a dataset and remove it from a project
Args:
config_file: path to the config file containing server information
project_name: name of the project to renove a dataset from
source_dataset_name: name of the dataset to remove
Returns:
"""
# setup logger
logger = tbox.utils.logger.create("my_logger")
# get config and setup client
config = tbox.utils.config.from_yaml(config_file)
client = tbox.utils.client.create(**config["my_tamr_instance"])
# get dataset and project
source_dataset = client.datasets.by_name(source_dataset_name)
project = client.projects.by_name(project_name)
logger.info(
f"unmapping and removing dataset {source_dataset_name} from project {project_name}"
)
tbox.project.mastering.schema.unmap_dataset(
project, source_dataset=source_dataset, remove_dataset_from_project=True
)
if __name__ == "__main__":
main()
Categorization¶
Run Categorization Simple¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project without model training"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
*, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
"""Runs the continuous steps (no training) of a categorization project
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
categorization_project_id: The id of the target categorization project
Returns: List of jobs run
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
my_project = tamr_client.projects.by_resource_id(categorization_project_id)
my_project = my_project.as_categorization()
# Run the typical steps of a project
LOGGER.info(f"About to run project: {my_project.name}")
operations = tbox.project.categorization.jobs.run(my_project, run_apply_feedback=False)
LOGGER.info(f"Tasks for {my_project.name} complete")
return operations
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
categorization_project_id=CONFIG["projects"]["my_categorization_project"],
)
Run Categorization Step-By-Step¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Tamr Categorization project step-by-step including model training
"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
*, instance_connection_info: Dict[str, Any], categorization_project_id: str
) -> List[Operation]:
"""Runs all steps of a categorization project
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
categorization_project_id: The id of the target categorization project
Returns: List of jobs run
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
my_project = tamr_client.projects.by_resource_id(categorization_project_id)
my_project = my_project.as_categorization()
# Run all steps of a project, step-by-step
LOGGER.info(f"About to run project with training: {my_project.name}")
op_list1 = tbox.project.categorization.jobs.update_unified_dataset(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
op_list2 = tbox.project.categorization.jobs.apply_feedback_and_update_results(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list2]}")
LOGGER.info(f"All tasks for {my_project.name} complete")
# Each step returns a list of operations
# We return a combined list of all operation run in the script, in the order that they were run
return [*op_list1, *op_list2]
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
categorization_project_id=CONFIG["projects"]["my_categorization_project"],
)
Bootstrap a Categorization Model¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for bootstrapping a Tamr Categorization project with taxonomy categories"""
import argparse
import logging
import pandas as pd
import requests
from typing import Any, Dict, Optional
from tamr_unify_client.project.attribute_mapping.resource import AttributeMappingSpec
import tamr_toolbox as tbox
from tamr_toolbox.project.categorization import metrics
LOGGER = logging.getLogger(__name__)
# name of existing unified attribute to be compared against category names
UNIFIED_ATTRIBUTE_NAME = "description"
def main(
*,
instance_connection_info: Dict[str, Any],
categorization_project_id: str,
unified_attribute_name: str,
category_tier: Optional[int] = None,
) -> None:
"""Bootstraps the model for a categorization projcets by adding the taxonomy as a separate
source with training labels
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
categorization_project_id: The id of the target categorization project
unified_attribute_name: The unified attribute to map the category names onto
category_tier: Which tier of the taxonomy to confine labels to. Use -1 for leaf nodes.
If not passed, all categories at all tiers will be used.
Returns:
Boolean indicating whether boostrap was successful or not
Raises:
TypeError: retrieved project is not a categorization project
ValueError: retrieved project does not have an attribute of the specified name
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
project = tamr_client.projects.by_resource_id(categorization_project_id).as_categorization()
LOGGER.info(f"Retrieved project with name: {project.name}")
# Validate dataset and attribute names
# Confirm the target unified attribute exists
try:
project.attributes.by_name(unified_attribute_name)
except requests.exceptions.HTTPError:
raise RuntimeError(f"Project {project.name} has no attribute {unified_attribute_name}.")
# Create a dataset with taxonomy categories
dataset_name = f"{project.unified_dataset().name}_taxonomy_bootstrap_dataset"
try:
project.client.datasets.by_name(dataset_name)
except KeyError:
# Dataset with `dataset_name` does not exist in Tamr. Proceed with dataset creation.
pass
else:
dataset_exists_error = (
f"A dataset with name {dataset_name} already exists. Try again after deleting the "
"dataset."
)
LOGGER.error(dataset_exists_error)
raise RuntimeError(dataset_exists_error)
# Proceed with dataset creation
# Get the project taxonomy
try:
project.taxonomy()
except requests.exceptions.RequestException:
raise RuntimeError(f"Project {project.name} is not associated with any taxonomy yet.")
LOGGER.info(f"Retrieved project taxonomy with name: {project.taxonomy().name}")
# Bootstrap all available categories
categories = project.taxonomy().categories()
if category_tier is None:
category_list = [category.path for category in categories]
else:
category_set = metrics._get_categories_at_tier(project=project, tier=category_tier)
category_list = [category.split("|") for category in category_set]
category_list.sort()
# Create a dictionary of full path as a string to the leaf node name (used as label path)
taxonomy_dict = {", ".join(category): category[-1] for category in category_list}
# Create a dataframe
df = pd.DataFrame(list(taxonomy_dict.items()), columns=["Category Path", "Category Name"])
# Create a dataset in Tamr
taxonomy_dataset = project.client.datasets.create_from_dataframe(
df, primary_key_name="Category Path", dataset_name=dataset_name
)
LOGGER.info(f"Created a dataset in Tamr with name: {taxonomy_dataset.name}")
# Add the dataset into the project
project.add_input_dataset(taxonomy_dataset)
LOGGER.info(f"Added {taxonomy_dataset.name} to project {project.name}")
# Map category name attribute to new unified attribute
attr_mapping_spec = (
AttributeMappingSpec.new()
.with_input_dataset_name(dataset_name)
.with_input_attribute_name("Category Name")
.with_unified_dataset_name(project.unified_dataset().name)
.with_unified_attribute_name(unified_attribute_name)
)
project.attribute_mappings().create(attr_mapping_spec.to_dict())
LOGGER.info(
f"Created mapping from source attribute 'Category Name' to unified attribute "
f"{unified_attribute_name}"
)
# Create transformation ensuring dataset tamr_id values match categorization path
all_tx = tbox.project.schema_mapping.transformations.get_all(project)
new_tx = (
f"SELECT *, CASE WHEN origin_source_name = '{dataset_name}' THEN "
f"concat(origin_source_name, '_', origin_entity_id) ELSE tamr_id END AS tamr_id;"
)
# Append so that it is applied after any other possibly conflicting transformations
all_tx.unified_scope.append(new_tx)
tbox.project.schema_mapping.transformations.set_all(project, all_tx)
LOGGER.info("Updating the unified dataset...")
tbox.project.categorization.jobs.update_unified_dataset(project)
# Prepare and post labels
labels_to_bootstrap = [
{
"action": "CREATE",
"recordId": f"{dataset_name}_{key}",
"record": {"verified": {"category": {"path": path}, "reason": "Taxonomy bootstrap"}},
}
for key, path in taxonomy_dict.items()
]
project.client.post(
f"projects/{project.resource_id}/categorizations/labels:updateRecords",
json=labels_to_bootstrap,
).successful()
LOGGER.info(f"Created and inserted labels into {project.name}")
# Apply feedback and update results
tbox.project.categorization.jobs.apply_feedback_and_update_results(project)
LOGGER.info("Successfully applied and updated the model")
LOGGER.info(f"Completed bootstrapping taxonomy in project {project.name}")
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
tbox.utils.logger.enable_toolbox_logging(log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
categorization_project_id=CONFIG["projects"]["my_categorization_project"],
unified_attribute_name=UNIFIED_ATTRIBUTE_NAME,
)
Obtain Average Confidence for a Specific Tier¶
"""Snippet for retrieving confidence metrics from a Tamr Categorization project"""
import tamr_toolbox as tbox
from tamr_toolbox.project.categorization.metrics import get_tier_confidence
# Read config, make Tamr Client, make logger
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")
# Get a Tamr categorization project by ID
my_project = tamr.projects.by_resource_id("my_project_id")
# By default gets the average confidence at leaf nodes without allowing dataset to refresh
leaf_node_confidence_dict = get_tier_confidence(my_project)
# Can allow the dataset to refresh if it is not streamable
# NOTE THIS WILL KICK OFF A <MATERIALIZE VIEWS> JOB
leaf_node_confidence_dict2 = get_tier_confidence(my_project, allow_dataset_refresh=True)
# Can also set the specific tier, which starts at 1
tier1_confidence_dict = get_tier_confidence(my_project, tier=1)
Make changes to taxonomy¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""
This script provides an example of how to make changes to the taxonomy of a categorization project
without losing the verified categories already provided. Suppose we had the following taxonomy:
root
└── Animal & Pet Supplies
├── Cat Supplies
├── Crocodile Supplies
└── Dog Supplies
And we wished to edit this taxonomy to the following instead:
root
└── Supplies
├── Animal Supplies
│ └── Crocodile Supplies
└── Pet Supplies
├── Cat Supplies
└── Dog Supplies
This example makes the changes by doing the following in order:
1. Creating a new node "Supplies".
2. Moving node "Animal & Pet Supplies" to the node "Supplies -> Pet Supplies"
3. Creating a new node "Supplies -> Animal Supplies"
3. Moving "Supplies -> Pet Supplies -> Crocodile Supplies" to "Supplies -> Animal Supplies ->
Crocodile Supplies"
For larger scale changes, the required paths can also be provided programmatically.
"""
import argparse
from typing import Dict, Any
import tamr_toolbox as tbox
def main(*, instance_connection_info: Dict[str, Any], categorization_project_id: str) -> None:
"""Loads the taxonomy of a categorization project and creates a node, renames it, deletes it
and moves a node to another location.
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
categorization_project_id: The id of the target categorization project
Returns: None
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Create a new node "Supplies":
new_path = ["Supplies"]
LOGGER.info(f"Creating new node {new_path}")
tbox.project.categorization.taxonomy.create_node(
tamr_client, categorization_project_id, new_path
)
# Move node "Animal & Pet Supplies" to the node "Supplies -> Pet Supplies":
old_path = ["Animal & Pet Supplies"]
new_path = ["Supplies", "Pet Supplies"]
LOGGER.info(f"Moving node {old_path} to {new_path}")
tbox.project.categorization.taxonomy.move_node(
tamr_client, categorization_project_id, old_path, new_path
)
# Create a new node "Supplies -> Animal Supplies"
new_path = ["Supplies", "Animal Supplies"]
LOGGER.info(f"Creating node {new_path}")
tbox.project.categorization.taxonomy.create_node(
tamr_client, categorization_project_id, new_path
)
# Move "Supplies -> Pet Supplies -> Crocodile Supplies" to "Supplies -> Animal Supplies ->
# Crocodile Supplies":
new_path = ["Supplies", "Animal Supplies", "Crocodile Supplies"]
node_to_move = ["Supplies", "Pet Supplies", "Crocodile Supplies"]
LOGGER.info(f"Moving node {node_to_move} to be under {new_path}")
tbox.project.categorization.taxonomy.move_node(
tamr_client, categorization_project_id, node_to_move, new_path
)
return
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
categorization_project_id=CONFIG["projects"]["my_categorization_project"],
)
Mastering¶
Run Mastering Simple¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Tamr Mastering project without model training"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
*, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
"""Runs the continuous steps (no training) of a mastering project
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
mastering_project_id: The id of the target mastering project
Returns: List of jobs run
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
my_project = tamr_client.projects.by_resource_id(mastering_project_id)
my_project = my_project.as_mastering()
# Run the typical steps of a project
LOGGER.info(f"About to run project: {my_project.name}")
operations = tbox.project.mastering.jobs.run(
my_project, run_apply_feedback=False, run_estimate_pair_counts=False
)
LOGGER.info(f"Tasks for {my_project.name} complete")
return operations
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
mastering_project_id=CONFIG["projects"]["my_mastering_project"],
)
Run Mastering Step-By-Step¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Mastering project step-by-step including model training"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
*, instance_connection_info: Dict[str, Any], mastering_project_id: str
) -> List[Operation]:
"""Runs all steps of a mastering project
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
mastering_project_id: The id of the target mastering project
Returns: List of jobs run
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
my_project = tamr_client.projects.by_resource_id(mastering_project_id)
my_project = my_project.as_mastering()
# Run all steps of a project, step-by-step
LOGGER.info(f"About to run project with training: {my_project.name}")
op_list1 = tbox.project.mastering.jobs.update_unified_dataset(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
op_list2 = tbox.project.mastering.jobs.estimate_pair_counts(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list2]}")
op_list3 = tbox.project.mastering.jobs.generate_pairs(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list3]}")
op_list4 = tbox.project.mastering.jobs.apply_feedback_and_update_results(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list4]}")
op_list5 = tbox.project.mastering.jobs.publish_clusters(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list5]}")
LOGGER.info(f"All tasks for {my_project.name} complete")
# Each step returns a list of operations
# We return a combined list of all operation run in the script, in the order that they were run
return [*op_list1, *op_list2, *op_list3, *op_list4, *op_list5]
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
mastering_project_id=CONFIG["projects"]["my_mastering_project"],
)
Golden Records¶
Run Golden Records Simple¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project"""
import argparse
from typing import List, Dict
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
*, instance_connection_info: Dict[str, str], golden_records_project_id: str
) -> List[Operation]:
"""Runs the continuous steps of a golden records project
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
golden_records_project_id: The id of the target golden records project
Returns: List of jobs run
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
my_project = tamr_client.projects.by_resource_id(golden_records_project_id)
# Run the typical steps of a project
LOGGER.info(f"About to run project: {my_project.name}")
operations = tbox.project.golden_records.jobs.run(my_project)
LOGGER.info(f"Tasks for {my_project.name} complete")
return operations
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
)
Run Golden Records Step-By-Step¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Tamr Golden Records project step-by-step"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
*, instance_connection_info: Dict[str, Any], golden_records_project_id: str
) -> List[Operation]:
"""Runs all steps of a golden records project
Args:
instance_connection_info: Information for connecting to Tamr (host, port, username etc)
golden_records_project_id: The id of the target golden records project
Returns: List of jobs run
"""
# Create the tamr client
tamr_client = tbox.utils.client.create(**instance_connection_info)
# Retrieve the project
my_project = tamr_client.projects.by_resource_id(golden_records_project_id)
# Run all steps of a project, step-by-step
LOGGER.info(f"About to run project with training: {my_project.name}")
op_list1 = tbox.project.golden_records.jobs.update_golden_records(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list1]}")
op_list2 = tbox.project.golden_records.jobs.publish_golden_records(my_project)
LOGGER.info(f"Completed: {[op.description for op in op_list2]}")
LOGGER.info(f"All tasks for {my_project.name} complete")
# Each step returns a list of operations
# We return a combined list of all operation run in the script, in the order that they were run
return [*op_list1, *op_list2]
if __name__ == "__main__":
# Set up command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to a YAML configuration file", required=False)
args = parser.parse_args()
# Load the configuration from the file path provided or the default file path specified
CONFIG = tbox.utils.config.from_yaml(
path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
)
# Use the configuration to create a global logger
LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
# Run the main function
main(
instance_connection_info=CONFIG["my_tamr_instance"],
golden_records_project_id=CONFIG["projects"]["my_golden_records_project"],
)
Schema Mapping¶
Run Schema Mapping Simple¶
# Connection and logging configuration consumed by tbox.utils.config.from_yaml.
# The Example comments suggest $NAME placeholders resolve from environment
# variables at load time — confirm against tbox.utils.config behavior.
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"
# Friendly project aliases mapped to Tamr project resource ids
projects:
  my_mastering_project: "1"
  my_golden_records_project: "2"
  my_categorization_project: "3"
  my_schema_mapping_project: "4"
"""Example script for running a Tamr Schema Mapping project"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Run the continuous steps of a schema mapping project.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns:
        List of jobs run
    """
    # Connect to the Tamr instance and look up the target project by its resource id
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(schema_mapping_project_id)

    # Run the typical steps of the project and collect the resulting operations
    LOGGER.info(f"About to run project: {project.name}")
    ops = tbox.project.schema_mapping.jobs.run(project)
    LOGGER.info(f"Tasks for {project.name} complete")
    return ops
# Script entry point: read CLI args, build config and logger, then run main().
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    # --config is optional; when omitted, from_yaml falls back to default_path_to_file
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: LOGGER is bound at module scope here so main(), defined above, can log through it
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Run Schema Mapping Step-By-Step¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for running a Schema Mapping project step-by-step"""
import argparse
from typing import List, Dict, Any
from tamr_unify_client.operation import Operation
import tamr_toolbox as tbox
def main(
    *, instance_connection_info: Dict[str, Any], schema_mapping_project_id: str
) -> List[Operation]:
    """Run all steps of a schema mapping project, one step at a time.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        schema_mapping_project_id: The id of the target schema mapping project

    Returns:
        List of jobs run
    """
    # Connect to the Tamr instance and look up the target project by its resource id
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(schema_mapping_project_id)

    # Run the project step-by-step; updating the unified dataset is the only step here
    LOGGER.info(f"About to run project: {project.name}")
    update_ops = tbox.project.schema_mapping.jobs.update_unified_dataset(project)
    LOGGER.info(f"Completed: {[op.description for op in update_ops]}")
    LOGGER.info(f"All tasks for {project.name} complete")

    # Each step returns a list of operations; return them combined, in the order they were run
    return list(update_ops)
# Script entry point: read CLI args, build config and logger, then run main().
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    # --config is optional; when omitted, from_yaml falls back to default_path_to_file
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: LOGGER is bound at module scope here so main(), defined above, can log through it
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        schema_mapping_project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Transformations¶
Edit Unified Transformations¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations scoped to the unified dataset
in a Tamr project"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse
def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edit the transformations scoped to a project's unified dataset.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project
    """
    # Connect to the Tamr instance and look up the target project by its resource id
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(project_id)

    # Fetch the current unified-dataset transformations as a list of strings.
    # The same call is available as tbox.project.mastering.transformations.get_all_unified
    # and tbox.project.categorization.transformations.get_all_unified.
    LOGGER.info(f"Updating transformations for {project.name}")
    unified_tx = tbox.project.schema_mapping.transformations.get_all_unified(project)
    # View the transformations
    LOGGER.debug(unified_tx)
    # Example print output
    """
    [
    'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    '//Just a comment',
    '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
    ]
    """
    # Edit the list locally; nothing is sent to Tamr until set_all_unified below.
    # Delete the second-to-last transformation on the unified dataset,
    # append one at the end, and insert one as the second transformation.
    unified_tx.pop(-2)
    unified_tx.append("SELECT *, lower(to_string(last_name)) as last_name;")
    unified_tx.insert(1, "SELECT *, to_int(ssn) as ssn;")
    # View the transformations again after the local edits
    LOGGER.debug(unified_tx)
    # Example print output
    """
    [
    'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    'SELECT *, to_int(ssn) as ssn;',
    '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
    'SELECT *, lower(to_string(last_name)) as last_name;'
    ]
    """
    # Push the edited transformation list back to the Tamr project.
    # The same call is available as tbox.project.mastering.transformations.set_all_unified
    # and tbox.project.categorization.transformations.set_all_unified.
    tbox.project.schema_mapping.transformations.set_all_unified(project, unified_tx)
    LOGGER.info(f"Completed updating unified transformations for {project.name}")
# Script entry point: read CLI args, build config and loggers, then run main().
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    # --config is optional; when omitted, from_yaml falls back to default_path_to_file
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: LOGGER is bound at module scope here so main(), defined above, can log through it
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Direct the loggers for tamr-toolbox and tamr-unify-client to the same file
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )
    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Edit Unified and Input Transformations¶
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"
my_tamr_instance:
host: $TAMR_HOST # Example: "1.2.3.4"
protocol: "http"
port: "9100"
username: "admin"
password: $TAMR_PASSWORD # Example: "abc123"
projects:
my_mastering_project: "1"
my_golden_records_project: "2"
my_categorization_project: "3"
my_schema_mapping_project: "4"
"""Example script for creation and editing of transformations of a Tamr project"""
from typing import Dict, Any
import tamr_toolbox as tbox
import argparse
def main(*, instance_connection_info: Dict[str, Any], project_id: str) -> None:
    """Edit both the input-scope and unified-scope transformations of a Tamr project.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        project_id: The id of the target project
    """
    # Connect to the Tamr instance and look up the target project by its resource id
    client = tbox.utils.client.create(**instance_connection_info)
    project = client.projects.by_resource_id(project_id)

    # Fetch every transformation in the project (input scope and unified scope).
    # The same call is available as tbox.project.mastering.transformations.get_all
    # and tbox.project.categorization.transformations.get_all.
    LOGGER.info(f"Updating transformations for {project.name}")
    tx = tbox.project.schema_mapping.transformations.get_all(project)
    # View the transformations
    LOGGER.debug(tx)
    # Example print output
    """
    Transformations(
    input_scope=[
    InputTransformation(
    transformation='select *, lower(to_string(first_name)) as "first_name"',
    datasets=[
    tamr_unify_client.dataset.resource.Dataset(
    relative_id='datasets/3', name='people_tiny.csv', version='2')
    ]
    )
    ],
    unified_scope=[
    'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    '//Just a comment',
    '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;'
    ]
    )
    """
    # Edit the Transformations object locally; Tamr is only updated by set_all below.
    # Unified scope: delete the second-to-last transformation, append one, insert one second.
    tx.unified_scope.pop(-2)
    tx.unified_scope.append("SELECT *, lower(to_string(last_name)) as last_name;")
    tx.unified_scope.insert(1, "SELECT *, to_int(ssn) as ssn;")
    # Input scope: prepend a transformation that applies only to the employees dataset.
    # InputTransformation is likewise available under the mastering and categorization
    # transformations modules.
    employees = project.input_datasets().by_name("employees_tiny.csv")
    ssn_input_tx = tbox.project.schema_mapping.transformations.InputTransformation(
        "SELECT *, to_int(ssn) as ssn;", [employees]
    )
    tx.input_scope.insert(0, ssn_input_tx)
    # View the transformations again after the local edits
    LOGGER.debug(tx)
    # Example print output
    """
    Transformations(
    input_scope=[
    InputTransformation(
    transformation='SELECT *, to_int(ssn) as ssn;',
    datasets=[
    tamr_unify_client.dataset.resource.Dataset(
    relative_id='datasets/64', name='employees_tiny.csv', version='162')
    ]
    ),
    InputTransformation(
    transformation='select *, lower(to_string(first_name)) as "first_name"',
    datasets=[
    tamr_unify_client.dataset.resource.Dataset(
    relative_id='datasets/3', name='people_tiny.csv', version='2')
    ]
    )
    ],
    unified_scope=[
    'SELECT *, concat(to_string(first_name), \' \', to_string(last_name)) as "full_name";',
    'SELECT *, to_int(ssn) as ssn;',
    '//Example comment\nSELECT *, filter(is_not_empty, all_names) AS all_names;',
    'SELECT *, lower(to_string(last_name)) as last_name;'
    ]
    )
    """
    # Push the edited transformations back to the Tamr project.
    # The same call is available as tbox.project.mastering.transformations.set_all
    # and tbox.project.categorization.transformations.set_all.
    tbox.project.schema_mapping.transformations.set_all(project, tx)
    LOGGER.info(f"Completed updating transformations for {project.name}")
# Script entry point: read CLI args, build config and loggers, then run main().
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    # --config is optional; when omitted, from_yaml falls back to default_path_to_file
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()
    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # Use the configuration to create a global logger
    # NOTE: LOGGER is bound at module scope here so main(), defined above, can log through it
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Direct the loggers for tamr-toolbox and tamr-unify-client to the same file
    tbox.utils.logger.enable_package_logging("tamr_toolbox", log_directory=CONFIG["logging_dir"])
    tbox.utils.logger.enable_package_logging(
        "tamr_unify_client", log_directory=CONFIG["logging_dir"]
    )
    # Run the main function with the connection info and project id from the config file
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        project_id=CONFIG["projects"]["my_schema_mapping_project"],
    )
Project Movement¶
Import Artifacts¶
"""Snippet for importing project artifacts into a Tamr project"""
import tamr_toolbox as tbox
from tamr_toolbox.project import import_artifacts
# Connect to the Tamr instance
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")

# Look up the destination project (needed only when importing into an existing project)
project = tamr.projects.by_name("current_categorization_project")

# Location of the project artifact zip on the server that hosts the Tamr instance
artifact_path = "/home/ubuntu/tamr/projectExports/minimal_categorization-1622067179477.zip"

# Import the artifacts into the existing project; overwriting must be opted into
# explicitly via overwrite_existing, then show the resulting operation
overwrite_op = import_artifacts(
    project_artifact_path=str(artifact_path),
    tamr_client=tamr,
    target_project=project,
    overwrite_existing=True,
)
print(overwrite_op)

# Import the same artifacts as a brand-new project instead, and show that operation
create_op = import_artifacts(
    project_artifact_path=str(artifact_path),
    tamr_client=tamr,
    new_project_name="new_categorization",
)
print(create_op)
Export Artifacts¶
"""Snippet for exporting project artifacts from a Tamr project"""
import tamr_toolbox as tbox
from tamr_toolbox.project import export_artifacts
from tamr_toolbox.models.project_artifacts import CategorizationArtifacts
# Connect to the Tamr instance
tamr = tbox.utils.client.create(username="user", password="pw", host="localhost")

# Look up the project to export
project = tamr.projects.by_resource_id("my_project_id")

# Directory on the server hosting the Tamr instance that will receive the export
path_export_dir = "/home/ubuntu/tamr/projectExports"

# Categorization artifacts to leave out of the export. Codes may be spelled out
# directly if known, or referenced through the CategorizationArtifacts dataclass.
artifacts_to_skip = [
    CategorizationArtifacts.CATEGORIZATION_VERIFIED_LABELS,
    "CATEGORIZATION_TAXONOMIES",
    CategorizationArtifacts.CATEGORIZATION_FEEDBACK,
]

# Run the export synchronously and show the resulting operation
export_op = export_artifacts(
    project=project,
    artifact_directory_path=path_export_dir,
    exclude_artifacts=artifacts_to_skip,
    asynchronous=False,
)
print(export_op)
Fork Project¶
"""Example script for creating a copy of an existing project with a new name
To copy a project as "_copy"-postfixed name of the target project:
python fork_project.py --export_path /path/to/export-file-dir/
--project_name <target_project_name>
--postfix _copy
To specify new project name explicitly:
python fork_project.py --export_path /path/to/export-file-dir/
--project_name <target_project_name>
--new_name <new_project_name>
"""
import os
import re
from pathlib import Path
from typing import Dict, Any
import argparse
import tamr_toolbox as tbox
from tamr_toolbox import utils as tbu
from tamr_toolbox.utils.client import Client
from tamr_toolbox.utils.operation import Operation
def export_from_tamr(client: Client, *, project_name: str, export_path: str) -> Operation:
    """
    This function sets path for project artifacts export from Tamr and makes the call to execute
    the export action.
    Export path defaults to "project-movement/<project_name>" (relative to this script's
    directory) if no user-defined value is passed.
    Args:
        client: an instance of Tamr unify client object
        project_name: name of the project to be exported from Tamr
        export_path: export path - must be accessible to the VM hosting Tamr
    Returns:
        operation for project export api call
    """
    project = client.projects.by_name(project_name)
    # Fall back to a default export directory next to this script when no path was given
    # (any falsy value -- None or empty string -- triggers the fallback)
    if not export_path:
        export_path = os.path.join(
            Path(__file__).resolve().parent, f"project-movement/{project_name}"
        )
    return tbox.project.export_artifacts(project=project, artifact_directory_path=export_path)
def main(
    *,
    project_name: str,
    new_name: str,
    postfix: str,
    new_ud_name: str,
    export_path: Path,
    overwrite: bool,
    instance_connection_info: Dict[str, Any],
) -> None:
    """
    This function creates a fork copy of a Tamr project by exporting the target project
    and importing it back to Tamr under a new name
    Args:
        project_name: name of the existing target project
        new_name: name of the forked project (Optional)
        postfix: if specified, will use and modify the target project name (Optional)
        new_ud_name: explicitly specify the name of unified dataset of forked project (Optional)
        export_path: export path - must be accessible to the VM hosting Tamr
            (NOTE(review): annotated as Path but argparse supplies a str -- confirm)
        overwrite: flag to overwrite existing project artifacts
        instance_connection_info: Tamr instance & AUTH configs
    """
    tamr_client = tbu.client.create(**instance_connection_info)
    # calling the action functions:
    # exporting the target project from tamr; enforce_success raises if the export failed
    LOGGER.info(f"Project {project_name} export from Tamr initializing...")
    op = export_from_tamr(tamr_client, project_name=project_name, export_path=export_path)
    tbu.operation.enforce_success(op)
    LOGGER.info(op)
    ## preparing for the import
    # finding the path to export file from op JSON response:
    # the operation description contains "...: <path>.zip"; take the first match
    zipfile_path = re.findall(r":\s?(.+\.zip)", op.description)[0]
    # constructing the new project name and respective unified dataset name to be imported to tamr
    # (exactly one of new_name / postfix is expected to be set by the CLI parser)
    new_project_name = new_name if new_name else f"{project_name}{postfix}"
    new_ud_name = new_ud_name if new_ud_name else f"{new_project_name}_unified_dataset"
    # importing a copy of target project to tamr; again abort on failure
    LOGGER.info(f"Project {new_project_name} import to Tamr initializing...")
    op = tbox.project.import_artifacts(
        tamr_client=tamr_client,
        project_artifact_path=zipfile_path,
        new_project_name=new_project_name,
        new_unified_dataset_name=new_ud_name,
        overwrite_existing=overwrite,
    )
    tbu.operation.enforce_success(op)
    LOGGER.info(op)
    LOGGER.info(
        f"Project {new_project_name} was successfully forked from Tamr project {project_name}!"
    )
# Script entry point: parse CLI args, load config, build logger, then fork the project.
if __name__ == "__main__":
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    parser.add_argument(
        "--project_name",
        default=None,
        required=True,
        help="raise to specify the name of target project",
    )
    parser.add_argument(
        "--export_path",
        default=None,
        required=True,
        help="raise to specify the path to export directory",
    )
    parser.add_argument(
        "--new_ud_name",
        default=None,
        help="raise to explicitly specify the name of forked project unified dataset (optional)",
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="if raised will replace a project with specified target name (if one already exists)",
    )
    # exactly one of --new_name / --postfix must be supplied; argparse enforces this
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--new_name", default=None, help="raise to explicitly specify the name of the new project"
    )
    group.add_argument(
        "--postfix",
        default=None,
        help="raise to imply the name of the new project by postfixing the original project name",
    )
    opts = parser.parse_args()
    # load config file and create tamr client
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=opts.config, default_path_to_file="/path/to/my/conf/project.config.yaml"
    )
    # creating the logger object:
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])
    # Let Tamr Toolbox itself also contribute to the log
    tbu.logger.enable_toolbox_logging(log_directory=CONFIG["logging_dir"], log_to_terminal=False)
    # NOTE(review): this script reads connection info from CONFIG["tamr"], unlike the
    # other examples which use CONFIG["my_tamr_instance"] -- confirm the config file key
    main(
        project_name=opts.project_name,
        new_name=opts.new_name,
        postfix=opts.postfix,
        new_ud_name=opts.new_ud_name,
        export_path=opts.export_path,
        overwrite=opts.overwrite,
        instance_connection_info=CONFIG["tamr"],
    )