Harmony Suite CUSTOM SoT Build

This section describes how to generate the Harmony Suite CUSTOM SoT (Source of Truth) build.

Steps to Generate the Harmony Suite CUSTOM SoT (Source of Truth) Build

The steps for building a CUSTOM SoT are listed below.

Preprocess the raw input data
- Ensure the raw input data is ready for processing. At this point you can perform basic data quality checks and reformat fields as needed.

Sample Step 1 Preprocessing Script

import csv
from datetime import datetime
import re


# Function to transform date from ccyymmdd000000 to DD/MM/CCYY
def transform_date(date_str):
    """Convert a 14-character ``ccyymmdd000000`` value to ``DD/MM/CCYY``.

    Only the first eight characters (the date part) are parsed; the
    remaining six are discarded.

    Args:
        date_str: The raw value from the input file.

    Returns:
        The reformatted date string, or ``date_str`` unchanged when it is
        empty/``None`` or not exactly 14 characters long.

    Raises:
        ValueError: If the value is 14 characters long but its first eight
            characters are not a valid ``%Y%m%d`` date.
    """
    # Anything that is not a full 14-character stamp is passed through as-is.
    if not date_str or len(date_str) != 14:
        return date_str

    parsed = datetime.strptime(date_str[:8], '%Y%m%d')
    return parsed.strftime('%d/%m/%Y')

def split_rd_location(input_string):
    """Split a value such as ``"RD 3 Hamilton"`` into its two parts.

    The value must start with ``RD``, a space, one or more digits, another
    space, and then at least one further character.

    Args:
        input_string: The text to split.

    Returns:
        A ``(rd, location)`` tuple, e.g. ``("RD 3", "Hamilton")``, or
        ``(None, None)`` when the value does not have that shape.
    """
    parts = re.match(r'(RD \d+) (.+)', input_string)
    if parts is None:
        return None, None
    return parts.group(1), parts.group(2)
    
def run_step_1(input_dir: str, processed_dir: str):
    """Preprocess ``./TIL_ADDRESS.csv`` and write ``./TIL_ADDRESS_step1.csv``.

    For every input row this step:

    * reformats ``CREATED_DATE``, ``ALPHA_MODIFIED_DATE`` and
      ``GEOM_MODIFIED_DATE`` (when present) with ``transform_date``;
    * blanks ``HOUSE_HIGH``, ``FLAT_HIGH`` and ``HOUSE_HIGH_SUFFIX`` when they
      equal their ``*_LOW`` counterpart, so a one-value range is not repeated;
    * splits ``RURAL_DELIVERY_NUMBER`` with ``split_rd_location``, keeping the
      ``RD <n>`` part and, when ``TIL_TOWN_NAME`` is non-empty, replacing the
      town name with the trailing location;
    * assembles ``FULL_ADDRESS`` from the component columns.

    Args:
        input_dir: Currently unused; the input path is fixed to
            ``./TIL_ADDRESS.csv`` in the working directory.
        processed_dir: Currently unused; the output path is fixed to
            ``./TIL_ADDRESS_step1.csv`` in the working directory.

    Raises:
        KeyError: If a required column (e.g. ``HOUSE_LOW``) is missing from
            the input file.
    """
    input_file_path = './TIL_ADDRESS.csv'
    temp_file_path = './TIL_ADDRESS_step1.csv'

    # utf-8-sig drops a leading BOM so the first column name is read cleanly;
    # newline='' lets the csv module do its own line-ending handling.
    with open(input_file_path, 'r', encoding='utf-8-sig', newline='') as infile, \
            open(temp_file_path, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(infile)
        fieldnames = list(reader.fieldnames or [])
        # FULL_ADDRESS is assigned on every row below; DictWriter raises
        # ValueError for keys that are not in its header, so make sure the
        # column exists even when the input file does not provide it.
        if 'FULL_ADDRESS' not in fieldnames:
            fieldnames.append('FULL_ADDRESS')

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Transform date fields in the input file
            for date_field in ['CREATED_DATE', 'ALPHA_MODIFIED_DATE', 'GEOM_MODIFIED_DATE']:
                if date_field in row:
                    row[date_field] = transform_date(row[date_field])

            # A range whose high end equals its low end is a single value:
            # blank the high end so it is not printed as "12-12".
            if row['HOUSE_LOW'] == row['HOUSE_HIGH']:
                row['HOUSE_HIGH'] = ''

            if row['FLAT_LOW'] == row['FLAT_HIGH']:
                row['FLAT_HIGH'] = ''

            if row['HOUSE_LOW_SUFFIX'] == row['HOUSE_HIGH_SUFFIX']:
                row['HOUSE_HIGH_SUFFIX'] = ''

            if row['RURAL_DELIVERY_NUMBER'] != '':
                rd, location = split_rd_location(row['RURAL_DELIVERY_NUMBER'])
                # Only overwrite when the value really has the
                # "RD <n> <location>" shape. Otherwise the original values are
                # kept, rather than being replaced by None (which would blank
                # the RD number and put the text "None" in FULL_ADDRESS).
                if rd is not None:
                    row['RURAL_DELIVERY_NUMBER'] = rd
                    if row['TIL_TOWN_NAME'] != '':
                        row['TIL_TOWN_NAME'] = location

            # Assemble FULL_ADDRESS
            if row['FLAT_HIGH']:
                subdwelling = f"{row['UNIT_TYPE']} {row['FLAT_LOW']}-{row['FLAT_HIGH']}/"
            else:
                subdwelling = f"{row['UNIT_TYPE']} {row['FLAT_LOW']}"

            if row['HOUSE_HIGH']:
                street = f"{row['HOUSE_LOW']}{row['HOUSE_LOW_SUFFIX']}-{row['HOUSE_HIGH']}{row['HOUSE_HIGH_SUFFIX']}"
            else:
                street = f"{row['HOUSE_LOW']}{row['HOUSE_LOW_SUFFIX']}"

            full_address = f"{subdwelling} {street} {row['FULL_PRIMARY_ROAD_NAME']}, {row['LOCALITY_NAME']} {row['TIL_TOWN_NAME']} {row['POSTCODE']}"
            # Empty components leave runs of spaces behind; collapse them and
            # trim both ends so the address reads cleanly.
            row['FULL_ADDRESS'] = ' '.join(full_address.split())

            # Write the updated row to the temporary file
            writer.writerow(row)

    # Replace the original file with the updated file
    # import os
    # os.replace(temp_file_path, input_file_path)

    print(f"Updated input file saved as {temp_file_path}.")

if __name__ == "__main__":
    # Placeholder directory names; replace them with real locations.
    input_dir, processed_dir = (
        "path_to_input_directory",
        "path_to_processed_directory",
    )
    run_step_1(input_dir=input_dir, processed_dir=processed_dir)

Field Mapping

This step maps address components to their respective fields in the index.

Sample Step 2 Field Mapping

import csv

#from app.app_settings import app_settings
# from utils.files import copy_file_to_dir

# Field mapping: input CSV column name -> output field name.
# Columns that are not listed here keep their original name.
field_mapping = {
    # Record identifier
    "ID": "id",
    # Flat / unit
    "UNIT_TYPE": "flat_type",
    "FLAT_LOW": "flat_number_1",
    "FLAT_HIGH": "flat_number_2",
    # Street number range and suffixes
    "HOUSE_LOW": "street_number_1",
    "HOUSE_HIGH": "street_number_2",
    "HOUSE_LOW_SUFFIX": "street_number_1_suffix",
    "HOUSE_HIGH_SUFFIX": "street_number_2_suffix",
    # Level and building
    "LEVEL_NO": "level_number",
    "HABITATION_NAME": "building_name",
    # Street
    "PRIMARY_NAME": "street_name",
    "PRIMARY_TYPE": "street_type",
    "PRIMARY_SUFFIX": "street_suffix1",
    # Assembled address, locality and postcode
    "FULL_ADDRESS": "full_address",
    "LOCALITY_NAME": "locality",
    "TIL_TA_NAME": "city",
    "POSTCODE": "postcode",
    # Rural delivery
    "RURAL_DELIVERY_NUMBER": "delivery_number",
    # Coordinates
    "WGS84_LONG": "longitude",
    "WGS84_LAT": "latitude",
}

# Process the transformed CSV file
def run_step_2(processing_dir: str, output_dir: str):
    """Convert ``./TIL_ADDRESS_step1.csv`` to pipe-delimited ``./tui-custom.txt``.

    Column names are renamed through ``field_mapping`` (unmapped columns keep
    their name) and a trailing ``delivery_type`` column is added. Rows whose
    ``ADDRESS_TYPE`` is ``'Alias'`` are skipped.

    Args:
        processing_dir: Currently unused; the input path is fixed to
            ``./TIL_ADDRESS_step1.csv`` in the working directory.
        output_dir: Currently unused; the output path is fixed to
            ``./tui-custom.txt`` in the working directory.
    """
    input_file_path = './TIL_ADDRESS_step1.csv'
    output_file_path = './tui-custom.txt'

    # newline='' on the input lets the csv module do its own line-ending
    # handling, as the csv documentation recommends.
    with open(input_file_path, 'r', encoding='utf-8', newline='') as infile, \
            open(output_file_path, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.DictReader(infile)
        # reader.fieldnames is None for an empty file; treat that as no columns.
        fieldnames = [field_mapping.get(field, field) for field in (reader.fieldnames or [])]
        fieldnames.append("delivery_type")  # Add the new field to the header

        # Write the updated header to the output file
        outfile.write('|'.join(fieldnames) + '\n')

        # NOTE(review): values are joined with '|' without escaping, so a value
        # containing '|' or a line break would corrupt the output row -
        # confirm the input can never contain them.
        for row in reader:
            # Exclude records with ADDRESS_TYPE = 'Alias'
            if row.get('ADDRESS_TYPE') == 'Alias':
                continue

            # DictReader fills the missing cells of a short row with None and
            # stores surplus cells under the key None. Normalise missing values
            # to '' and drop the surplus so the join below cannot fail.
            updated_row = {
                field_mapping.get(key, key): (value or '')
                for key, value in row.items()
                if key is not None
            }

            # Ensure the FULL_ADDRESS field is not enclosed in double quotes
            if 'full_address' in updated_row:
                updated_row['full_address'] = updated_row['full_address'].replace('"', '')

            # Add the delivery_type field based on RURAL_DELIVERY_NUMBER.
            # Step 1 leaves that column already prefixed (e.g. "RD 3"), so the
            # prefix is only added when it is not already there; otherwise the
            # output would read "RD RD 3".
            delivery_number = updated_row.get('delivery_number', '')
            if not delivery_number:
                delivery_type = ""
            elif delivery_number.startswith("RD "):
                delivery_type = delivery_number
            else:
                delivery_type = f"RD {delivery_number}"
            updated_row['delivery_type'] = delivery_type

            # Write the updated row to the output file
            outfile.write('|'.join(updated_row.get(field, '') for field in fieldnames) + '\n')

    print(f"Processed file saved as {output_file_path}.")

if __name__ == "__main__":
    # Placeholder directory names; replace them with real locations.
    input_dir, processed_dir, output_dir = (
        "path_to_input_directory",
        "path_to_processed_directory",
        "path_to_output_directory",
    )

    # Run the second step.
    run_step_2(processing_dir=processed_dir, output_dir=output_dir)