Dataset

Manage Datasets

Create a dataset

# create_dataset.config.yaml

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

my_tamr_instance:
    host: $TAMR_HOST # Example: "1.2.3.4"
    protocol: "http"
    port: "9100"
    username: "admin"
    password: $TAMR_PASSWORD  # Example: "abc123"

datasets:
    my_mastering_project_dataset:
        name: "test_create_dataset"
        attributes: ["unique_id", "name", "address"]
        primary_keys: ["unique_id"]
        description: "Dataset for testing"
"""Example script for creating a dataset"""
import argparse
from typing import Dict, Any, List

import tamr_toolbox as tbox


def main(
    *,
    instance_connection_info: Dict[str, Any],
    dataset_name: str,
    attributes: List[str],
    primary_keys: List[str],
    description: str,
) -> None:
    """Creates a dataset in Tamr

    This function logs through the module-level ``LOGGER``, which this script only creates in
    its ``__main__`` section, so ``LOGGER`` must exist before this function is called.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        dataset_name: name of dataset
        attributes: list of attributes to create
        primary_keys: list of the primary key attribute name(s) for the dataset
        description: description of dataset
    """
    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)

    LOGGER.info(f"Creating dataset: {dataset_name}")

    tbox.dataset.manage.create(
        client=tamr_client,
        dataset_name=dataset_name,
        primary_keys=primary_keys,
        attributes=attributes,
        description=description,
    )


if __name__ == "__main__":
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to a YAML configuration file", required=False)
    args = parser.parse_args()

    # Load the configuration from the file path provided or the default file path specified
    CONFIG = tbox.utils.config.from_yaml(
        path_to_file=args.config,
        default_path_to_file="examples/resources/conf/create_dataset.config.yaml",
    )

    # Use the configuration to create a global logger
    LOGGER = tbox.utils.logger.create(__name__, log_directory=CONFIG["logging_dir"])

    # The keys read here must match create_dataset.config.yaml, where the dataset entry is
    # named "my_mastering_project_dataset" and its key list is stored under "primary_keys"
    DATASET_CONFIG = CONFIG["datasets"]["my_mastering_project_dataset"]

    # Run the main function
    main(
        instance_connection_info=CONFIG["my_tamr_instance"],
        dataset_name=DATASET_CONFIG["name"],
        attributes=DATASET_CONFIG["attributes"],
        primary_keys=DATASET_CONFIG["primary_keys"],
        description=DATASET_CONFIG["description"],
    )

Migrate dataset definition changes from a source instance to a target instance

# migrate_dataset.config.yaml

source_migration_instance:
    host: $DEV_HOST
    protocol: "http"
    port: "9100"
    username: $TAMR_USERNAME
    password: $TAMR_PASSWORD

target_migration_instance:
    host: $PROD_HOST
    protocol: "http"
    port: "9100"
    username: $TAMR_USERNAME
    password: $TAMR_PASSWORD

logging_dir: $TAMR_PROJECT_LOGGING_DIR # Example: "/home/users/jane/my-project/logs"

datasets:
    - dataset_name: "test_dataset"
"""
An example script to migrate changes to a dataset's attributes from one instance of Tamr to another
"""
import tamr_toolbox as tbox

# Read the example configuration describing both instances and the datasets to migrate
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Connect to the instance holding the dataset definitions to copy from
source_client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Connect to the instance that should receive those definitions
target_client = tbox.utils.client.create(**my_config["target_migration_instance"])

# Process every dataset listed in the configuration
for dataset_entry in my_config["datasets"]:
    dataset_name = dataset_entry["dataset_name"]

    # Look the dataset up on the source instance
    source_dataset = source_client.datasets.by_name(dataset_name)

    # Collect the source definition: attribute name -> attribute type, description and tags
    attr_type_dict = {}
    for attr in source_dataset.attributes:
        attr_type_json = attr.spec().to_dict()["type"]
        attr_type_dict[attr.name] = tbox.models.attribute_type.from_json(attr_type_json)
    source_description = source_dataset.description
    source_tags = source_dataset.tags

    if not tbox.dataset.manage.exists(client=target_client, dataset_name=dataset_name):
        # The dataset is missing on the target instance: create it from the source dataset
        tbox.dataset.manage.create(
            client=target_client,
            primary_keys=source_dataset.spec().to_dict()["keyAttributeNames"],
            dataset=source_dataset,
        )
        continue

    # The dataset already exists on the target instance: apply the source definition to it
    tbox.dataset.manage.update(
        dataset=target_client.datasets.by_name(dataset_name),
        attributes=attr_type_dict.keys(),
        attribute_types=attr_type_dict,
        description=source_description,
        tags=source_tags,
        override_existing_types=True,
    )

Add attributes to a dataset

"""
An example script to change dataset attributes for a Tamr dataset
"""
import tamr_toolbox as tbox
from tamr_toolbox.models import attribute_type


# Read the example configuration holding the connection details
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Connect to the Tamr instance described under "source_migration_instance"
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Fetch the dataset that will receive the new attributes
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Type to assign to each new attribute, keyed by attribute name
attribute_types = {
    "client_id": attribute_type.STRING,
    "name": attribute_type.DEFAULT,
    "address": attribute_type.DEFAULT,
    "user_id": attribute_type.Array(attribute_type.INT),
    "sales": attribute_type.DOUBLE,
    "location": attribute_type.GEOSPATIAL,
}

# Names of the attributes to create
# Note you can optionally just pass in this list of attribute names
# The attribute types will then default to ARRAY STRING
attribute_names = ["client_id", "name", "address", "user_id", "sales", "location"]

updated_dataset = tbox.dataset.manage.create_attributes(
    dataset=dataset,
    attributes=attribute_names,
    attribute_types=attribute_types,
)

Remove an attribute from a dataset

"""
An example script to delete attributes from a Tamr dataset
"""
import tamr_toolbox as tbox

# load example multi config
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Create the source tamr client
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Get dataset from Tamr instance
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Name of the single attribute to remove; it is wrapped in a list in the call below so that a
# list of attribute names (not a nested list) is passed
attribute_name = "location"

updated_dataset = tbox.dataset.manage.delete_attributes(
    dataset=dataset, attributes=[attribute_name]
)

Update records within a dataset

"""
An example script to add and delete a dataset's records
"""
import tamr_toolbox as tbox

# Read the example configuration holding the connection details
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Connect to the Tamr instance described under "source_migration_instance"
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Fetch the dataset whose records will be changed
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Upsert a sample record with a primary key of "27" and delete an existing record with a
# primary key of "14", assuming that the name of the primary key attribute is "primaryKey".
record_to_upsert = {"First_Name": ["John"], "Last_Name": ["Smith"]}
dataset = tbox.dataset.manage.update_records(
    dataset,
    updates=[record_to_upsert, "delete"],
    primary_keys=["27", "14"],
    primary_key_name="primaryKey",
)

# Delete the records having primary keys "7" and "13" in the same dataset as above.
dataset = tbox.dataset.manage.update_records(
    dataset,
    delete_all=True,
    primary_keys=["7", "13"],
    primary_key_name="primaryKey",
)

Dataset Profiles

Create and retrieve a profile for a dataset

"""
An example script to create and/or retrieve a dataset profile
"""

import tamr_toolbox as tbox

# Read the example configuration holding the connection details
my_config = tbox.utils.config.from_yaml("examples/resources/conf/migrate_dataset.config.yaml")

# Connect to the Tamr instance described under "source_migration_instance"
client = tbox.utils.client.create(**my_config["source_migration_instance"])

# Fetch the dataset to profile
dataset_name = "<your dataset name>"
dataset = client.datasets.by_name(dataset_name)

# Setting the optional allow_create_or_refresh argument to True below ensures that
#     - A new profile is created if it does not exist or is in an inconsistent state
#     - The profile is refreshed if it is out-of-date
# NOTE(review): confirm that get_profile is exposed directly on tbox.dataset (it may live
# under tbox.dataset.profile instead)
profile = tbox.dataset.get_profile(dataset, allow_create_or_refresh=True)