Monday, 15 December 2025

SCD 2 TYPE IMPLEMENTATION WITH DIM AND FACT TABLE

 -- ===============================================

-- COMPLETE SCD TYPE 2 ETL EXAMPLE

-- ===============================================

-- This example shows the complete flow from source to fact tables

-- with SCD Type 2 dimension handling


-- ===============================================

-- STEP 1: CREATE SOURCE TABLES (Simulating OLTP)

-- ===============================================


-- Source Customer Table (Business System)

CREATE TABLE source_customer (

    customer_id INT PRIMARY KEY,

    customer_name VARCHAR(100),

    email VARCHAR(100),

    address VARCHAR(200),

    phone VARCHAR(20),

    last_modified_date DATE

);


-- Source Product Table

CREATE TABLE source_product (

    product_id INT PRIMARY KEY,

    product_name VARCHAR(100),

    category VARCHAR(50),

    price DECIMAL(10,2),

    last_modified_date DATE

);


-- Source Sales Table

CREATE TABLE source_sales (

    sale_id INT PRIMARY KEY,

    customer_id INT,

    product_id INT,

    sale_date DATE,

    quantity INT,

    amount DECIMAL(10,2)

);


-- ===============================================

-- STEP 2: CREATE STAGING TABLES

-- ===============================================


CREATE TABLE staging_customer (

    customer_id INT,

    customer_name VARCHAR(100),

    email VARCHAR(100),

    address VARCHAR(200),

    phone VARCHAR(20),

    last_modified_date DATE

);


CREATE TABLE staging_sales (

    sale_id INT,

    customer_id INT,

    product_id INT,

    sale_date DATE,

    quantity INT,

    amount DECIMAL(10,2)

);


-- ===============================================

-- STEP 3: CREATE DIMENSION TABLES (SCD TYPE 2)

-- ===============================================


CREATE TABLE dim_customer (

    customer_sk INT PRIMARY KEY AUTO_INCREMENT,  -- Surrogate Key

    customer_id INT NOT NULL,                     -- Business Key

    customer_name VARCHAR(100),

    email VARCHAR(100),

    address VARCHAR(200),

    phone VARCHAR(20),

    effective_date DATE NOT NULL,

    end_date DATE,

    is_current BOOLEAN NOT NULL DEFAULT 1,

    created_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    INDEX idx_business_key (customer_id, is_current),

    INDEX idx_date_range (customer_id, effective_date, end_date)

);


CREATE TABLE dim_product (

    product_sk INT PRIMARY KEY AUTO_INCREMENT,

    product_id INT NOT NULL,

    product_name VARCHAR(100),

    category VARCHAR(50),

    price DECIMAL(10,2),

    effective_date DATE NOT NULL,

    end_date DATE,

    is_current BOOLEAN NOT NULL DEFAULT 1,

    created_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    INDEX idx_business_key (product_id, is_current)

);


-- Date Dimension (Type 1 - no history needed)

CREATE TABLE dim_date (

    date_sk INT PRIMARY KEY,

    full_date DATE,

    year INT,

    quarter INT,

    month INT,

    day INT,

    day_of_week VARCHAR(20)

);


-- ===============================================

-- STEP 4: CREATE FACT TABLE

-- ===============================================


CREATE TABLE fact_sales (

    sale_id INT PRIMARY KEY,

    customer_sk INT NOT NULL,           -- Links to dimension surrogate key

    product_sk INT NOT NULL,             -- Links to dimension surrogate key

    date_sk INT NOT NULL,

    quantity INT,

    amount DECIMAL(10,2),

    load_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    FOREIGN KEY (customer_sk) REFERENCES dim_customer(customer_sk),

    FOREIGN KEY (product_sk) REFERENCES dim_product(product_sk),

    FOREIGN KEY (date_sk) REFERENCES dim_date(date_sk),

    INDEX idx_customer (customer_sk),

    INDEX idx_product (product_sk),

    INDEX idx_date (date_sk)

);


-- ===============================================

-- STEP 5: INSERT INITIAL SOURCE DATA

-- ===============================================


-- Initial Customer Data (Jan 2024)

INSERT INTO source_customer VALUES

(101, 'John Smith', 'john@email.com', '123 Oak Street, Boston', '555-0101', '2024-01-01'),

(102, 'Mary Johnson', 'mary@email.com', '456 Elm Avenue, NYC', '555-0102', '2024-01-01'),

(103, 'Bob Williams', 'bob@email.com', '789 Pine Road, Chicago', '555-0103', '2024-01-01');


-- Initial Product Data

INSERT INTO source_product VALUES

(201, 'Laptop Pro', 'Electronics', 1200.00, '2024-01-01'),

(202, 'Wireless Mouse', 'Electronics', 25.00, '2024-01-01'),

(203, 'Office Chair', 'Furniture', 350.00, '2024-01-01');


-- Initial Sales (Jan-Mar 2024)

INSERT INTO source_sales VALUES

(1001, 101, 201, '2024-01-15', 1, 1200.00),

(1002, 102, 202, '2024-02-10', 2, 50.00),

(1003, 101, 203, '2024-03-05', 1, 350.00),

(1004, 103, 201, '2024-03-20', 1, 1200.00);


-- ===============================================

-- STEP 6: INITIAL DIMENSION LOAD (FIRST RUN)

-- ===============================================


-- Load dim_customer - Initial Load

INSERT INTO dim_customer (customer_id, customer_name, email, address, phone, effective_date, end_date, is_current)

SELECT 

    customer_id,

    customer_name,

    email,

    address,

    phone,

    last_modified_date as effective_date,

    NULL as end_date,

    1 as is_current

FROM source_customer;


-- Load dim_product - Initial Load

INSERT INTO dim_product (product_id, product_name, category, price, effective_date, end_date, is_current)

SELECT 

    product_id,

    product_name,

    category,

    price,

    last_modified_date as effective_date,

    NULL as end_date,

    1 as is_current

FROM source_product;


-- Load dim_date (simplified - just the dates we need)

INSERT INTO dim_date VALUES

(20240115, '2024-01-15', 2024, 1, 1, 15, 'Monday'),

(20240210, '2024-02-10', 2024, 1, 2, 10, 'Saturday'),

(20240305, '2024-03-05', 2024, 1, 3, 5, 'Tuesday'),

(20240320, '2024-03-20', 2024, 1, 3, 20, 'Wednesday'),

(20240725, '2024-07-25', 2024, 3, 7, 25, 'Thursday'),

(20240815, '2024-08-15', 2024, 3, 8, 15, 'Thursday');


-- Insert default/unknown records

INSERT INTO dim_customer (customer_sk, customer_id, customer_name, email, address, phone, effective_date, is_current) VALUES

(-1, -1, 'Unknown', 'unknown@unknown.com', 'Unknown', 'Unknown', '1900-01-01', 1);


INSERT INTO dim_product (product_sk, product_id, product_name, category, price, effective_date, is_current) VALUES

(-1, -1, 'Unknown', 'Unknown', 0.00, '1900-01-01', 1);


-- ===============================================

-- STEP 7: INITIAL FACT TABLE LOAD

-- ===============================================


-- Load fact_sales with surrogate key lookups

INSERT INTO fact_sales (sale_id, customer_sk, product_sk, date_sk, quantity, amount)

SELECT 

    s.sale_id,

    COALESCE(dc.customer_sk, -1) as customer_sk,  -- Get surrogate key

    COALESCE(dp.product_sk, -1) as product_sk,    -- Get surrogate key

    CAST(DATE_FORMAT(s.sale_date, '%Y%m%d') AS UNSIGNED) as date_sk,

    s.quantity,

    s.amount

FROM source_sales s

LEFT JOIN dim_customer dc 

    ON s.customer_id = dc.customer_id 

    AND dc.is_current = 1                          -- Get current version

LEFT JOIN dim_product dp 

    ON s.product_id = dp.product_id 

    AND dp.is_current = 1;


-- View initial results

SELECT 'Initial Dimension Data' as Step;

SELECT * FROM dim_customer WHERE customer_id > 0;


SELECT 'Initial Fact Data with Dimension Info' as Step;

SELECT 

    f.sale_id,

    dc.customer_name,

    dc.address as customer_address,

    dp.product_name,

    f.amount,

    f.quantity

FROM fact_sales f

JOIN dim_customer dc ON f.customer_sk = dc.customer_sk

JOIN dim_product dp ON f.product_sk = dp.product_sk

ORDER BY f.sale_id;


-- ===============================================

-- STEP 8: SIMULATE CHANGES (July 2024)

-- ===============================================


-- Customer 101 moves to a new address

UPDATE source_customer 

SET address = '999 Maple Drive, Boston', 

    last_modified_date = '2024-07-01'

WHERE customer_id = 101;


-- Customer 102 changes email

UPDATE source_customer 

SET email = 'mary.johnson@newemail.com',

    last_modified_date = '2024-07-01'

WHERE customer_id = 102;


-- Product price change

UPDATE source_product 

SET price = 1150.00,

    last_modified_date = '2024-07-01'

WHERE product_id = 201;


-- New sales AFTER the changes

INSERT INTO source_sales VALUES

(1005, 101, 202, '2024-07-25', 3, 75.00),   -- John at NEW address

(1006, 102, 201, '2024-08-15', 1, 1150.00); -- Mary with NEW email, product at NEW price


-- ===============================================

-- STEP 9: SCD TYPE 2 UPDATE PROCESS

-- ===============================================


-- Extract changes to staging

TRUNCATE staging_customer;

INSERT INTO staging_customer 

SELECT * FROM source_customer;


-- Process SCD Type 2 for dim_customer

-- Step 9a: Identify changed records

CREATE TEMPORARY TABLE changed_customers AS

SELECT 

    s.customer_id,

    s.customer_name,

    s.email,

    s.address,

    s.phone,

    s.last_modified_date

FROM staging_customer s

JOIN dim_customer d 

    ON s.customer_id = d.customer_id 

    AND d.is_current = 1

WHERE 

    s.customer_name != d.customer_name 

    OR s.email != d.email 

    OR s.address != d.address 

    OR s.phone != d.phone;


-- Step 9b: Expire old records (set is_current = 0, add end_date)

UPDATE dim_customer d

JOIN changed_customers c ON d.customer_id = c.customer_id

SET 

    d.is_current = 0,

    d.end_date = DATE_SUB(c.last_modified_date, INTERVAL 1 DAY)

WHERE d.is_current = 1;


-- Step 9c: Insert new versions

INSERT INTO dim_customer (customer_id, customer_name, email, address, phone, effective_date, end_date, is_current)

SELECT 

    customer_id,

    customer_name,

    email,

    address,

    phone,

    last_modified_date as effective_date,

    NULL as end_date,

    1 as is_current

FROM changed_customers;


-- Similar process for products

CREATE TEMPORARY TABLE changed_products AS

SELECT 

    s.product_id,

    s.product_name,

    s.category,

    s.price,

    s.last_modified_date

FROM source_product s

JOIN dim_product d 

    ON s.product_id = d.product_id 

    AND d.is_current = 1

WHERE 

    s.product_name != d.product_name 

    OR s.category != d.category 

    OR s.price != d.price;


UPDATE dim_product d

JOIN changed_products c ON d.product_id = c.product_id

SET 

    d.is_current = 0,

    d.end_date = DATE_SUB(c.last_modified_date, INTERVAL 1 DAY)

WHERE d.is_current = 1;


INSERT INTO dim_product (product_id, product_name, category, price, effective_date, end_date, is_current)

SELECT 

    product_id,

    product_name,

    category,

    price,

    last_modified_date as effective_date,

    NULL as end_date,

    1 as is_current

FROM changed_products;


-- ===============================================

-- STEP 10: LOAD NEW FACT RECORDS

-- ===============================================


-- Load new sales with CURRENT surrogate keys

INSERT INTO fact_sales (sale_id, customer_sk, product_sk, date_sk, quantity, amount)

SELECT 

    s.sale_id,

    COALESCE(dc.customer_sk, -1) as customer_sk,

    COALESCE(dp.product_sk, -1) as product_sk,

    CAST(DATE_FORMAT(s.sale_date, '%Y%m%d') AS UNSIGNED) as date_sk,

    s.quantity,

    s.amount

FROM source_sales s

LEFT JOIN dim_customer dc 

    ON s.customer_id = dc.customer_id 

    AND dc.is_current = 1                    -- Gets NEW version!

LEFT JOIN dim_product dp 

    ON s.product_id = dp.product_id 

    AND dp.is_current = 1

WHERE s.sale_id NOT IN (SELECT sale_id FROM fact_sales);


-- ===============================================

-- STEP 11: VERIFICATION QUERIES

-- ===============================================


-- Show SCD Type 2 in action - Customer History

SELECT 

    'Customer 101 History - Shows Both Versions' as Description,

    customer_sk,

    customer_id,

    customer_name,

    address,

    effective_date,

    end_date,

    is_current

FROM dim_customer

WHERE customer_id = 101

ORDER BY effective_date;


-- Show Product History

SELECT 

    'Product 201 History - Price Change' as Description,

    product_sk,

    product_id,

    product_name,

    price,

    effective_date,

    end_date,

    is_current

FROM dim_product

WHERE product_id = 201

ORDER BY effective_date;


-- Show ALL sales with historical accuracy

SELECT 

    'All Sales with Historical Context' as Description,

    f.sale_id,

    f.date_sk as sale_date,

    dc.customer_name,

    dc.address as customer_address_at_sale_time,

    dc.effective_date as customer_version_from,

    dc.end_date as customer_version_to,

    dp.product_name,

    dp.price as product_price_at_sale_time,

    f.quantity,

    f.amount

FROM fact_sales f

JOIN dim_customer dc ON f.customer_sk = dc.customer_sk

JOIN dim_product dp ON f.product_sk = dp.product_sk

ORDER BY f.sale_id;


-- Show customer's sales across address changes

SELECT 

    'Customer 101 Sales - Different Addresses at Different Times' as Description,

    f.sale_id,

    f.date_sk as sale_date,

    dc.address as address_at_time_of_sale,

    dc.is_current as is_current_address,

    dp.product_name,

    f.amount

FROM fact_sales f

JOIN dim_customer dc ON f.customer_sk = dc.customer_sk

JOIN dim_product dp ON f.product_sk = dp.product_sk

WHERE dc.customer_id = 101

ORDER BY f.sale_id;


-- Analytical Query: Sales by Customer (all versions aggregated by business key)

SELECT 

    'Sales Summary by Customer (All Versions)' as Description,

    dc.customer_id,

    MAX(CASE WHEN dc.is_current = 1 THEN dc.customer_name END) as current_name,

    COUNT(DISTINCT f.sale_id) as total_sales_count,

    SUM(f.amount) as total_sales_amount

FROM fact_sales f

JOIN dim_customer dc ON f.customer_sk = dc.customer_sk

GROUP BY dc.customer_id

ORDER BY total_sales_amount DESC;


-- Show current view (most common query pattern)

SELECT 

    'Current Snapshot - Latest Customer Info with All Sales' as Description,

    curr_dc.customer_name,

    curr_dc.address as current_address,

    COUNT(f.sale_id) as total_sales,

    SUM(f.amount) as total_amount

FROM fact_sales f

JOIN dim_customer dc ON f.customer_sk = dc.customer_sk  -- Historical join

JOIN dim_customer curr_dc 

    ON dc.customer_id = curr_dc.customer_id 

    AND curr_dc.is_current = 1                          -- Current info

GROUP BY curr_dc.customer_sk, curr_dc.customer_name, curr_dc.address;


-- ===============================================

-- CLEANUP TEMPORARY TABLES

-- ===============================================

DROP TEMPORARY TABLE IF EXISTS changed_customers;

DROP TEMPORARY TABLE IF EXISTS changed_products;

No comments:

Post a Comment