Dataset¶
Manage Datasets¶
Create a dataset¶
# create_dataset.config.yaml

# Directory that the example script writes its log files to
logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

# Connection settings; the script passes this mapping as keyword arguments when creating the client
my_tamr_instance:
  host: $TAMR_HOST # Example: "1.2.3.4"
  protocol: "http"
  port: "9100"
  username: "admin"
  password: $TAMR_PASSWORD # Example: "abc123"

# Definitions of the datasets to create, keyed by a label the script looks up
datasets:
  my_mastering_project_dataset:
    name: "test_create_dataset"
    attributes: ["unique_id", "name", "address"]
    primary_keys: ["unique_id"]
    description: "Dataset for testing"
"""Example script for creating a dataset"""
import argparse
from typing import Dict, Any, List
import tamr_toolbox as tbox
def main(
    *,
    instance_connection_info: Dict[str, Any],
    dataset_name: str,
    attributes: List[str],
    primary_keys: List[str],
    description: str,
) -> None:
    """Create a new dataset on a Tamr instance.

    Args:
        instance_connection_info: Keyword arguments used to build the Tamr client
            (host, port, username etc)
        dataset_name: Name of the dataset to create
        attributes: Names of the attributes to create on the dataset
        primary_keys: Names of the attributes that make up the dataset's primary key
        description: Description to attach to the dataset
    """
    # Build the Tamr client from the connection settings
    client = tbox.utils.client.create(**instance_connection_info)

    # LOGGER is the module-level logger set up in the `__main__` block of this script
    LOGGER.info(f"Creating dataset: {dataset_name}")

    # Gather the dataset definition in one place and hand it to the toolbox
    dataset_definition = {
        "dataset_name": dataset_name,
        "primary_keys": primary_keys,
        "attributes": attributes,
        "description": description,
    }
    tbox.dataset.manage.create(client=client, **dataset_definition)
if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config,
        default_path_to_file="examples/resources/conf/create_dataset.config.yaml",
    )

    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # The label and field names must match the entry under `datasets` in
    # create_dataset.config.yaml ("my_mastering_project_dataset", with a "primary_keys" list)
    DATASET_CONFIG = CONFIG["datasets"]["my_mastering_project_dataset"]

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        dataset_name=DATASET_CONFIG["name"],
        attributes=DATASET_CONFIG["attributes"],
        primary_keys=DATASET_CONFIG["primary_keys"],
        description=DATASET_CONFIG["description"],
    )
Migrate dataset definition changes from a source to target instance¶
# migrate_dataset.config.yaml

# Instance that holds the up-to-date dataset definitions
source_migration_instance:
  host: $DEV_HOST
  protocol: "http"
  port: "9100"
  username: $TAMR_USERNAME
  password: $TAMR_PASSWORD

# Instance that the dataset definitions are migrated to
target_migration_instance:
  host: $PROD_HOST
  protocol: "http"
  port: "9100"
  username: $TAMR_USERNAME
  password: $TAMR_PASSWORD

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

# Datasets to migrate; each entry names one dataset
datasets:
  - dataset_name: "test_dataset"
"""
An example script to migrate changes to a dataset's attributes from one instance of Tamr to another
"""
import tamr_toolbox as tbox
# Load the example config that describes both instances and the datasets to migrate
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Create one client for the source instance and one for the target instance
source_client = tbox.utils.client.create(**my_config["source_migration_instance"])
target_client = tbox.utils.client.create(**my_config["target_migration_instance"])

# Migrate every dataset listed in the config
for dataset_entry in my_config["datasets"]:
    dataset_name = dataset_entry["dataset_name"]

    # Fetch the dataset from the source instance
    source_dataset = source_client.datasets.by_name(dataset_name)

    # Collect the current definition: attribute name -> attribute type, plus description and tags
    attr_type_dict = {}
    for attr in source_dataset.attributes:
        type_json = attr.spec().to_dict()["type"]
        attr_type_dict[attr.name] = tbox.models.attribute_type.from_json(type_json)
    attribute_names = attr_type_dict.keys()
    description = source_dataset.description
    tags = source_dataset.tags

    target_has_dataset = tbox.dataset.manage.exists(
        client=target_client, dataset_name=dataset_name
    )
    if not target_has_dataset:
        # The dataset is missing on the target: create it from the source dataset
        primary_keys = source_dataset.spec().to_dict()["keyAttributeNames"]
        tbox.dataset.manage.create(
            client=target_client, primary_keys=primary_keys, dataset=source_dataset
        )
        continue

    # The dataset already exists on the target: update it with the source definition
    target_dataset = target_client.datasets.by_name(dataset_name)
    tbox.dataset.manage.update(
        dataset=target_dataset,
        attributes=attribute_names,
        attribute_types=attr_type_dict,
        description=description,
        tags=tags,
        override_existing_types=True,
    )
Add attributes to a dataset¶
"""
An example script to change dataset attributes for a Tamr dataset
"""
import tamr_toolbox as tbox
from tamr_toolbox.models import attribute_type
# Load the example config and connect to the instance listed as the migration source
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Look up the dataset to modify on the Tamr instance
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Type to assign to each new attribute
attribute_types = {
    "client_id": attribute_type.STRING,
    "name": attribute_type.DEFAULT,
    "address": attribute_type.DEFAULT,
    "user_id": attribute_type.Array(attribute_type.INT),
    "sales": attribute_type.DOUBLE,
    "location": attribute_type.GEOSPATIAL,
}

# Note you can optionally just pass in a list of attribute names
# The attribute types will default to ARRAY STRING
# Here the names are the keys of the type mapping, in the order they were written above
attribute_names = list(attribute_types)

updated_dataset = tbox.dataset.manage.create_attributes(
    dataset=dataset,
    attributes=attribute_names,
    attribute_types=attribute_types,
)
Remove an attribute from a dataset¶
"""
An example script to delete attributes from a Tamr dataset
"""
import tamr_toolbox as tbox
# load example multi config
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Create the source tamr client
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Get dataset from Tamr instance
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Name of the single attribute to remove. It is a plain string so that wrapping it below
# produces a flat list of names rather than a list nested inside another list.
attribute_name = "location"

updated_dataset = tbox.dataset.manage.delete_attributes(
    dataset=dataset, attributes=[attribute_name]
)
Update records within a dataset¶
"""
An example script to add and delete a dataset's records
"""
import tamr_toolbox as tbox
# Load the example config and connect to the instance listed as the migration source
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Look up the dataset whose records will be changed
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# The following example upserts a sample record with a primary key of "27" and deletes an
# existing record with a primary key of "14", assuming that the name of the primary key
# attribute is "primaryKey".
sample_record = {"First_Name": ["John"], "Last_Name": ["Smith"]}
updates = [sample_record, "delete"]
primary_keys = ["27", "14"]
dataset = tbox.dataset.manage.update_records(
    dataset,
    updates=updates,
    primary_keys=primary_keys,
    primary_key_name="primaryKey",
)

# The following example deletes the records having primary keys "7" and "13" in the same
# dataset as above.
primary_keys = ["7", "13"]
dataset = tbox.dataset.manage.update_records(
    dataset,
    delete_all=True,
    primary_keys=primary_keys,
    primary_key_name="primaryKey",
)
Dataset Profiles¶
Create and retrieve a profile for a dataset¶
"""
An example script to create and/or retrieve a dataset profile
"""
import tamr_toolbox as tbox
# Load the example config and connect to the instance listed as the migration source
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Look up the dataset to profile on the Tamr instance
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Setting the optional allow_create_or_refresh argument to True below ensures that
# - A new profile is created if it does not exist or is in an inconsistent state
# - The profile is refreshed if it is out-of-date
profile = tbox.dataset.get_profile(dataset, allow_create_or_refresh=True)