"""Tasks related to creating, updating, saving, and moving address validation data from Tamr"""
import copy
import json
import logging
import os
from dataclasses import asdict, dataclass, fields
from typing import Dict, List, Optional, Union
from requests.exceptions import HTTPError
from tamr_unify_client.dataset.collection import DatasetCollection
from tamr_unify_client.dataset.resource import Dataset
from typing_extensions import Literal
from tamr_toolbox.enrichment.enrichment_utils import create_empty_mapping
LOGGER = logging.getLogger(__name__)

# Granularity levels returned by the Google Address Validation API.
# "PREMISE_PROXIMITY" is the spelling used by the API; the misspelled
# "PREMISE_PROMXIMITY" is retained for backward compatibility with any
# existing annotations or stored data that reference it.
Granularity = Literal[
    "GRANULARITY_UNSPECIFIED",
    "SUB_PREMISE",
    "PREMISE",
    "PREMISE_PROXIMITY",
    "PREMISE_PROMXIMITY",  # deprecated misspelling, kept for compatibility
    "BLOCK",
    "ROUTE",
    "OTHER",
]
@dataclass
class AddressValidationMapping:
    """DataClass for address validation data.

    Args:
        input_address: input address
        validated_formatted_address: the "formattedAddress" returned by the validation API, if any
        expiration: the expiration timestamp of the data, 30 days from API call
        region_code: region code returned by the validation API
        postal_code: postal code returned by the validation API
        admin_area: administrative area returned by the validation API (state for US addresses)
        locality: locality returned by the validation API (city/town for US addresses)
        address_lines: address lines returned by the validation API (e.g. ['66 Church St'])
        usps_first_address_line: first address line in validated USPS format, if available
        usps_city_state_zip_line: second address line in validated USPS format, if available
        usps_city: city in validated USPS format, if available
        usps_state: state in validated USPS format, if available
        usps_zip_code: ZIP code in validated USPS format, if available
        latitude: latitude associated with validated address, if any
        longitude: longitude associated with validated address, if any
        place_id: the google placeId -- the only result field not subject to the `expiration`
        input_granularity: granularity of input given by validation API
        validation_granularity: granularity of validation given by validation API
        geocode_granularity: granularity of geocode given by validation API
        has_inferred: whether the result has inferred components
        has_unconfirmed: whether the result has unconfirmed components
        has_replaced: whether the result has replaced components
        address_complete: whether the input was complete
    """

    input_address: str
    validated_formatted_address: Optional[str]
    expiration: str  # timestamp in the format given by `str(datetime.now())`
    region_code: Optional[str]
    postal_code: Optional[str]
    admin_area: Optional[str]
    locality: Optional[str]
    address_lines: List[str]
    usps_first_address_line: Optional[str]
    usps_city_state_zip_line: Optional[str]
    usps_city: Optional[str]
    usps_state: Optional[str]
    usps_zip_code: Optional[str]
    latitude: Optional[float]
    longitude: Optional[float]
    place_id: Optional[str]
    # Granularity annotations are quoted (forward-reference style) so the class
    # does not depend on import/definition order at evaluation time.
    input_granularity: "Granularity"
    validation_granularity: "Granularity"
    geocode_granularity: "Granularity"
    has_inferred: bool
    has_unconfirmed: bool
    has_replaced: bool
    address_complete: bool
def to_dict(
    dictionary: Dict[str, "AddressValidationMapping"]
) -> List[Dict[str, Union[str, List[str], float, None]]]:
    """
    Convert a toolbox address validation mapping to a list-of-dictionaries format.

    Args:
        dictionary: a toolbox address validation mapping, keyed by input address

    Returns:
        One dictionary per mapping entry (via `dataclasses.asdict`), in
        the mapping's insertion order
    """
    return [asdict(entry) for entry in dictionary.values()]
def update(
    main_dictionary: Dict[str, "AddressValidationMapping"],
    tmp_dictionary: Dict[str, "AddressValidationMapping"],
) -> None:
    """
    Update a toolbox address validation mapping with another temporary address validation mapping.

    Entries from `tmp_dictionary` overwrite any entry in `main_dictionary` with
    the same input address. Shallow copies are stored so the two dictionaries do
    not share mapping objects after the merge.

    Args:
        main_dictionary: the main toolbox address validation mapping containing prior results
        tmp_dictionary: a temporary toolbox address validation mapping containing new data
    """
    main_dictionary.update(
        {input_addr: copy.copy(mapping) for input_addr, mapping in tmp_dictionary.items()}
    )
def from_dataset(dataset: Dataset) -> Dict[str, "AddressValidationMapping"]:
    """
    Stream an address validation mapping dataset from Tamr.

    Args:
        dataset: Tamr Dataset object

    Returns:
        A toolbox address validation mapping, keyed by input address

    Raises:
        ValueError: if the provided `dataset` is not a toolbox address validation mapping dataset
        NameError: if the provided `dataset` does not contain all the attributes of a
            toolbox address validation mapping
        RuntimeError: if there is any other problem while reading the `dataset` as a
            toolbox address validation mapping
    """
    # A toolbox address validation dataset is keyed on "input_address"
    if dataset.key_attribute_names[0] != "input_address":
        error_message = "Provided Tamr Dataset is not a toolbox address validation mapping"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    dictionary: Dict[str, AddressValidationMapping] = {}
    for record in dataset.records():
        try:
            # Non-key values stream back as length-1 lists (or an empty/None value);
            # unwrap to scalars, substituting a sensible default when absent.
            entry = AddressValidationMapping(
                input_address=record["input_address"],
                validated_formatted_address=record["validated_formatted_address"][0]
                if record["validated_formatted_address"]
                else None,
                expiration=record["expiration"][0],
                region_code=record["region_code"][0] if record["region_code"] else None,
                postal_code=record["postal_code"][0] if record["postal_code"] else None,
                admin_area=record["admin_area"][0] if record["admin_area"] else None,
                locality=record["locality"][0] if record["locality"] else None,
                address_lines=record["address_lines"] if record["address_lines"] else [],
                # NOTE(review): the usps_* attributes are not unwrapped with [0] like
                # the other attributes -- presumably they are stored as plain strings;
                # confirm against the dataset schema.
                usps_first_address_line=record["usps_first_address_line"]
                if record["usps_first_address_line"]
                else None,
                usps_city_state_zip_line=record["usps_city_state_zip_line"]
                if record["usps_city_state_zip_line"]
                else None,
                usps_city=record["usps_city"] if record["usps_city"] else None,
                usps_state=record["usps_state"] if record["usps_state"] else None,
                usps_zip_code=record["usps_zip_code"] if record["usps_zip_code"] else None,
                latitude=float(record["latitude"][0]) if record["latitude"] else None,
                longitude=float(record["longitude"][0]) if record["longitude"] else None,
                place_id=record["place_id"][0] if record["place_id"] else None,
                input_granularity=record["input_granularity"][0]
                if record["input_granularity"]
                else "GRANULARITY_UNSPECIFIED",
                validation_granularity=record["validation_granularity"][0]
                if record["validation_granularity"]
                else "GRANULARITY_UNSPECIFIED",
                geocode_granularity=record["geocode_granularity"][0]
                if record["geocode_granularity"]
                else "GRANULARITY_UNSPECIFIED",
                has_inferred=record["has_inferred"][0] if record["has_inferred"] else False,
                has_unconfirmed=record["has_unconfirmed"][0]
                if record["has_unconfirmed"]
                else False,
                has_replaced=record["has_replaced"][0] if record["has_replaced"] else False,
                address_complete=record["address_complete"][0]
                if record["address_complete"]
                else False,
            )
        except KeyError as exp:
            # A missing attribute means the dataset does not have the expected schema
            error_message = (
                f"Supplied Tamr dataset is not in toolbox address validation mapping format: {exp}"
            )
            LOGGER.error(error_message)
            raise NameError(error_message) from exp
        except Exception as exp:
            error_message = f"Error while reading Tamr dataset address validation mapping: {exp}"
            LOGGER.error(error_message)
            raise RuntimeError(error_message) from exp
        dictionary[entry.input_address] = entry
    return dictionary
def to_dataset(
    addr_mapping: Dict[str, "AddressValidationMapping"],
    *,
    dataset: Optional[Dataset] = None,
    datasets_collection: Optional[DatasetCollection] = None,
    create_dataset: bool = False,
    dataset_name: str = "address_validation_mapping",
) -> str:
    """Ingest a toolbox address validation mapping in Tamr, creating the source dataset if needed.

    Args:
        addr_mapping: a toolbox address validation mapping
        dataset: a Tamr client dataset; required when `create_dataset` is False
        datasets_collection: a Tamr client datasets collection; required when
            `create_dataset` is True
        create_dataset: flag to create or upsert to an existing address validation mapping
            source dataset
        dataset_name: name to use if creating new dataset

    Returns:
        The name of the created or updated Tamr Dataset

    Raises:
        ValueError: if `create_dataset` is False and `dataset` is not provided or is not a
            toolbox address validation mapping dataset.
            If `create_dataset` is True but `datasets_collection` is missing, or a Tamr
            dataset named `dataset_name` already exists
        RuntimeError: if there is an error during the creation of the Tamr dataset attributes
    """
    if not create_dataset:
        # Updating an existing dataset: it must exist and be keyed on "input_address"
        if dataset is None:
            error_message = (
                "Tamr Client Dataset missing from inputs. Please provide a Tamr "
                "Client Dataset if updating an existing address validation dataset"
            )
            LOGGER.error(error_message)
            raise ValueError(error_message)
        if dataset.key_attribute_names[0] != "input_address":
            error_message = "Provided Tamr Dataset is not a toolbox address validation mapping"
            LOGGER.error(error_message)
            raise ValueError(error_message)
    else:
        if not datasets_collection:
            error_message = (
                "Tamr Datasets Collection must be specified to create address validation dataset."
            )
            LOGGER.error(error_message)
            raise ValueError(error_message)
        if dataset_name in [d.name for d in datasets_collection]:
            error_message = (
                f"Tamr Dataset {dataset_name} already exists on Tamr, you cannot duplicate it."
            )
            LOGGER.error(error_message)
            raise ValueError(error_message)

        LOGGER.info("Creating toolbox address validation dataset %s on Tamr", dataset_name)
        creation_spec = {"name": dataset_name, "keyAttributeNames": ["input_address"]}
        dataset = datasets_collection.create(creation_spec)

        # Every non-key dataclass field becomes an ARRAY[STRING] attribute,
        # matching how Tamr streams source-dataset values back (length-1 lists)
        attributes = dataset.attributes
        for attribute in [att.name for att in fields(AddressValidationMapping)]:
            if attribute == "input_address":
                continue  # the key attribute is created with the dataset itself
            attr_spec = {
                "name": attribute,
                "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}},
            }
            try:
                attributes.create(attr_spec)
            except HTTPError as exp:
                error_message = (
                    f"Error while creating attribute {attribute} for dataset {dataset_name}: {exp}"
                )
                LOGGER.error(error_message)
                raise RuntimeError(error_message) from exp

    LOGGER.info("Ingesting toolbox address validation mapping to Tamr")
    dataset.upsert_records(records=to_dict(addr_mapping), primary_key_name="input_address")
    return dataset.name
def to_json(dictionary: Dict[str, "AddressValidationMapping"]) -> List[str]:
    """
    Convert a toolbox address validation mapping to a list of JSON strings,
    one serialized entry per mapping value.

    Args:
        dictionary: a toolbox address validation mapping

    Returns:
        A list of toolbox address validation mapping entries in JSON format
    """
    return [json.dumps(asdict(entry)) for entry in dictionary.values()]
def save(
    addr_mapping: Dict[str, "AddressValidationMapping"],
    addr_folder: str,
    filename: str = "address_validation_mapping.json",
) -> None:
    """
    Save a toolbox address validation mapping to disk as newline-delimited JSON
    (the format read back by `load`). Nothing is written when the mapping is empty.

    Args:
        addr_mapping: dictionary object to be saved to disk
        addr_folder: base directory where mapping is saved
        filename: filename to use to save
    """
    addr_filepath = os.path.join(addr_folder, filename)
    if len(addr_mapping) > 0:
        LOGGER.debug("Writing address mapping to file")
        with open(addr_filepath, "w") as f:
            f.write("\n".join(to_json(addr_mapping)))
def load(
    addr_folder: str, filename: str = "address_validation_mapping.json"
) -> Dict[str, "AddressValidationMapping"]:
    """
    Load a toolbox address validation mapping from disk to memory.

    The file is expected to contain one JSON-encoded mapping entry per line,
    as written by `save`.

    Args:
        addr_folder: base directory where mapping is saved
        filename: filename where mapping is saved

    Returns:
        A toolbox address validation mapping (empty if the file did not exist)

    Raises:
        RuntimeError: if the file was found on disk but is not of a valid toolbox address
            validation mapping type
    """
    filepath = os.path.join(addr_folder, filename)
    if not os.path.exists(filepath):
        LOGGER.info("Dictionary %s does not exist, creating an empty one.", filepath)
        # Side effect only: creates an empty mapping file on disk
        create_empty_mapping(path=filepath)
        return {}

    with open(filepath, "r") as f:
        # Skip blank lines so a trailing newline does not raise a JSONDecodeError
        raw_entries = [json.loads(line) for line in f if line.strip()]
    try:
        # Transform the loaded dictionaries into AddressValidationMapping objects
        # and key the mapping by input address
        mapping_dict = {
            entry.input_address: entry
            for entry in (AddressValidationMapping(**raw) for raw in raw_entries if raw)
        }
    except Exception as excp:
        error_message = (
            f"Could not read address validation mapping at {filepath}. "
            f"Check that the dictionary is of the correct type. Error: {excp}"
        )
        LOGGER.error(error_message)
        raise RuntimeError(error_message) from excp
    return mapping_dict