-- ===============================================
-- COMPLETE SCD TYPE 2 ETL EXAMPLE
-- ===============================================
-- This example shows the complete flow from source to fact tables
-- with SCD Type 2 dimension handling
-- ===============================================
-- STEP 1: CREATE SOURCE TABLES (Simulating OLTP)
-- ===============================================
-- Source Customer Table (Business System)
CREATE TABLE source_customer (
customer_id INT PRIMARY KEY,
customer_name VARCHAR(100),
email VARCHAR(100),
address VARCHAR(200),
phone VARCHAR(20),
last_modified_date DATE
);
-- Source Product Table
CREATE TABLE source_product (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
category VARCHAR(50),
price DECIMAL(10,2),
last_modified_date DATE
);
-- Source Sales Table
CREATE TABLE source_sales (
sale_id INT PRIMARY KEY,
customer_id INT,
product_id INT,
sale_date DATE,
quantity INT,
amount DECIMAL(10,2)
);
-- ===============================================
-- STEP 2: CREATE STAGING TABLES
-- ===============================================
CREATE TABLE staging_customer (
customer_id INT,
customer_name VARCHAR(100),
email VARCHAR(100),
address VARCHAR(200),
phone VARCHAR(20),
last_modified_date DATE
);
CREATE TABLE staging_sales (
sale_id INT,
customer_id INT,
product_id INT,
sale_date DATE,
quantity INT,
amount DECIMAL(10,2)
);
-- ===============================================
-- STEP 3: CREATE DIMENSION TABLES (SCD TYPE 2)
-- ===============================================
CREATE TABLE dim_customer (
customer_sk INT PRIMARY KEY AUTO_INCREMENT, -- Surrogate Key
customer_id INT NOT NULL, -- Business Key
customer_name VARCHAR(100),
email VARCHAR(100),
address VARCHAR(200),
phone VARCHAR(20),
effective_date DATE NOT NULL,
end_date DATE,
is_current BOOLEAN NOT NULL DEFAULT 1,
created_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_business_key (customer_id, is_current),
INDEX idx_date_range (customer_id, effective_date, end_date)
);
CREATE TABLE dim_product (
product_sk INT PRIMARY KEY AUTO_INCREMENT,
product_id INT NOT NULL,
product_name VARCHAR(100),
category VARCHAR(50),
price DECIMAL(10,2),
effective_date DATE NOT NULL,
end_date DATE,
is_current BOOLEAN NOT NULL DEFAULT 1,
created_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_business_key (product_id, is_current)
);
-- Date Dimension (Type 1 - no history needed)
CREATE TABLE dim_date (
date_sk INT PRIMARY KEY,
full_date DATE,
year INT,
quarter INT,
month INT,
day INT,
day_of_week VARCHAR(20)
);
-- ===============================================
-- STEP 4: CREATE FACT TABLE
-- ===============================================
CREATE TABLE fact_sales (
sale_id INT PRIMARY KEY,
customer_sk INT NOT NULL, -- Links to dimension surrogate key
product_sk INT NOT NULL, -- Links to dimension surrogate key
date_sk INT NOT NULL,
quantity INT,
amount DECIMAL(10,2),
load_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (customer_sk) REFERENCES dim_customer(customer_sk),
FOREIGN KEY (product_sk) REFERENCES dim_product(product_sk),
FOREIGN KEY (date_sk) REFERENCES dim_date(date_sk),
INDEX idx_customer (customer_sk),
INDEX idx_product (product_sk),
INDEX idx_date (date_sk)
);
-- ===============================================
-- STEP 5: INSERT INITIAL SOURCE DATA
-- ===============================================
-- Initial Customer Data (Jan 2024)
INSERT INTO source_customer VALUES
(101, 'John Smith', 'john@email.com', '123 Oak Street, Boston', '555-0101', '2024-01-01'),
(102, 'Mary Johnson', 'mary@email.com', '456 Elm Avenue, NYC', '555-0102', '2024-01-01'),
(103, 'Bob Williams', 'bob@email.com', '789 Pine Road, Chicago', '555-0103', '2024-01-01');
-- Initial Product Data
INSERT INTO source_product VALUES
(201, 'Laptop Pro', 'Electronics', 1200.00, '2024-01-01'),
(202, 'Wireless Mouse', 'Electronics', 25.00, '2024-01-01'),
(203, 'Office Chair', 'Furniture', 350.00, '2024-01-01');
-- Initial Sales (Jan-Mar 2024)
INSERT INTO source_sales VALUES
(1001, 101, 201, '2024-01-15', 1, 1200.00),
(1002, 102, 202, '2024-02-10', 2, 50.00),
(1003, 101, 203, '2024-03-05', 1, 350.00),
(1004, 103, 201, '2024-03-20', 1, 1200.00);
-- ===============================================
-- STEP 6: INITIAL DIMENSION LOAD (FIRST RUN)
-- ===============================================
-- Load dim_customer - Initial Load
INSERT INTO dim_customer (customer_id, customer_name, email, address, phone, effective_date, end_date, is_current)
SELECT
customer_id,
customer_name,
email,
address,
phone,
last_modified_date as effective_date,
NULL as end_date,
1 as is_current
FROM source_customer;
-- Load dim_product - Initial Load
INSERT INTO dim_product (product_id, product_name, category, price, effective_date, end_date, is_current)
SELECT
product_id,
product_name,
category,
price,
last_modified_date as effective_date,
NULL as end_date,
1 as is_current
FROM source_product;
-- Load dim_date (simplified - just the dates we need)
INSERT INTO dim_date VALUES
(20240115, '2024-01-15', 2024, 1, 1, 15, 'Monday'),
(20240210, '2024-02-10', 2024, 1, 2, 10, 'Saturday'),
(20240305, '2024-03-05', 2024, 1, 3, 5, 'Tuesday'),
(20240320, '2024-03-20', 2024, 1, 3, 20, 'Wednesday'),
(20240725, '2024-07-25', 2024, 3, 7, 25, 'Thursday'),
(20240815, '2024-08-15', 2024, 3, 8, 15, 'Thursday');
-- Insert default/unknown records
INSERT INTO dim_customer (customer_sk, customer_id, customer_name, email, address, phone, effective_date, is_current) VALUES
(-1, -1, 'Unknown', 'unknown@unknown.com', 'Unknown', 'Unknown', '1900-01-01', 1);
INSERT INTO dim_product (product_sk, product_id, product_name, category, price, effective_date, is_current) VALUES
(-1, -1, 'Unknown', 'Unknown', 0.00, '1900-01-01', 1);
-- ===============================================
-- STEP 7: INITIAL FACT TABLE LOAD
-- ===============================================
-- Load fact_sales with surrogate key lookups
INSERT INTO fact_sales (sale_id, customer_sk, product_sk, date_sk, quantity, amount)
SELECT
s.sale_id,
COALESCE(dc.customer_sk, -1) as customer_sk, -- Get surrogate key
COALESCE(dp.product_sk, -1) as product_sk, -- Get surrogate key
CAST(DATE_FORMAT(s.sale_date, '%Y%m%d') AS UNSIGNED) as date_sk,
s.quantity,
s.amount
FROM source_sales s
LEFT JOIN dim_customer dc
ON s.customer_id = dc.customer_id
AND dc.is_current = 1 -- Get current version
LEFT JOIN dim_product dp
ON s.product_id = dp.product_id
AND dp.is_current = 1;
-- View initial results
SELECT 'Initial Dimension Data' as Step;
SELECT * FROM dim_customer WHERE customer_id > 0;
SELECT 'Initial Fact Data with Dimension Info' as Step;
SELECT
f.sale_id,
dc.customer_name,
dc.address as customer_address,
dp.product_name,
f.amount,
f.quantity
FROM fact_sales f
JOIN dim_customer dc ON f.customer_sk = dc.customer_sk
JOIN dim_product dp ON f.product_sk = dp.product_sk
ORDER BY f.sale_id;
-- ===============================================
-- STEP 8: SIMULATE CHANGES (July 2024)
-- ===============================================
-- Customer 101 moves to a new address
UPDATE source_customer
SET address = '999 Maple Drive, Boston',
last_modified_date = '2024-07-01'
WHERE customer_id = 101;
-- Customer 102 changes email
UPDATE source_customer
SET email = 'mary.johnson@newemail.com',
last_modified_date = '2024-07-01'
WHERE customer_id = 102;
-- Product price change
UPDATE source_product
SET price = 1150.00,
last_modified_date = '2024-07-01'
WHERE product_id = 201;
-- New sales AFTER the changes
INSERT INTO source_sales VALUES
(1005, 101, 202, '2024-07-25', 3, 75.00), -- John at NEW address
(1006, 102, 201, '2024-08-15', 1, 1150.00); -- Mary with NEW email, product at NEW price
-- ===============================================
-- STEP 9: SCD TYPE 2 UPDATE PROCESS
-- ===============================================
-- Extract changes to staging
TRUNCATE staging_customer;
INSERT INTO staging_customer
SELECT * FROM source_customer;
-- Process SCD Type 2 for dim_customer
-- Step 9a: Identify changed records
CREATE TEMPORARY TABLE changed_customers AS
SELECT
s.customer_id,
s.customer_name,
s.email,
s.address,
s.phone,
s.last_modified_date
FROM staging_customer s
JOIN dim_customer d
ON s.customer_id = d.customer_id
AND d.is_current = 1
WHERE
s.customer_name != d.customer_name
OR s.email != d.email
OR s.address != d.address
OR s.phone != d.phone;
-- Step 9b: Expire old records (set is_current = 0, add end_date)
UPDATE dim_customer d
JOIN changed_customers c ON d.customer_id = c.customer_id
SET
d.is_current = 0,
d.end_date = DATE_SUB(c.last_modified_date, INTERVAL 1 DAY)
WHERE d.is_current = 1;
-- Step 9c: Insert new versions
INSERT INTO dim_customer (customer_id, customer_name, email, address, phone, effective_date, end_date, is_current)
SELECT
customer_id,
customer_name,
email,
address,
phone,
last_modified_date as effective_date,
NULL as end_date,
1 as is_current
FROM changed_customers;
-- Similar process for products
CREATE TEMPORARY TABLE changed_products AS
SELECT
s.product_id,
s.product_name,
s.category,
s.price,
s.last_modified_date
FROM source_product s
JOIN dim_product d
ON s.product_id = d.product_id
AND d.is_current = 1
WHERE
s.product_name != d.product_name
OR s.category != d.category
OR s.price != d.price;
UPDATE dim_product d
JOIN changed_products c ON d.product_id = c.product_id
SET
d.is_current = 0,
d.end_date = DATE_SUB(c.last_modified_date, INTERVAL 1 DAY)
WHERE d.is_current = 1;
INSERT INTO dim_product (product_id, product_name, category, price, effective_date, end_date, is_current)
SELECT
product_id,
product_name,
category,
price,
last_modified_date as effective_date,
NULL as end_date,
1 as is_current
FROM changed_products;
-- ===============================================
-- STEP 10: LOAD NEW FACT RECORDS
-- ===============================================
-- Load new sales with CURRENT surrogate keys
INSERT INTO fact_sales (sale_id, customer_sk, product_sk, date_sk, quantity, amount)
SELECT
s.sale_id,
COALESCE(dc.customer_sk, -1) as customer_sk,
COALESCE(dp.product_sk, -1) as product_sk,
CAST(DATE_FORMAT(s.sale_date, '%Y%m%d') AS UNSIGNED) as date_sk,
s.quantity,
s.amount
FROM source_sales s
LEFT JOIN dim_customer dc
ON s.customer_id = dc.customer_id
AND dc.is_current = 1 -- Gets NEW version!
LEFT JOIN dim_product dp
ON s.product_id = dp.product_id
AND dp.is_current = 1
WHERE s.sale_id NOT IN (SELECT sale_id FROM fact_sales);
-- ===============================================
-- STEP 11: VERIFICATION QUERIES
-- ===============================================
-- Show SCD Type 2 in action - Customer History
SELECT
'Customer 101 History - Shows Both Versions' as Description,
customer_sk,
customer_id,
customer_name,
address,
effective_date,
end_date,
is_current
FROM dim_customer
WHERE customer_id = 101
ORDER BY effective_date;
-- Show Product History
SELECT
'Product 201 History - Price Change' as Description,
product_sk,
product_id,
product_name,
price,
effective_date,
end_date,
is_current
FROM dim_product
WHERE product_id = 201
ORDER BY effective_date;
-- Show ALL sales with historical accuracy
SELECT
'All Sales with Historical Context' as Description,
f.sale_id,
f.date_sk as sale_date,
dc.customer_name,
dc.address as customer_address_at_sale_time,
dc.effective_date as customer_version_from,
dc.end_date as customer_version_to,
dp.product_name,
dp.price as product_price_at_sale_time,
f.quantity,
f.amount
FROM fact_sales f
JOIN dim_customer dc ON f.customer_sk = dc.customer_sk
JOIN dim_product dp ON f.product_sk = dp.product_sk
ORDER BY f.sale_id;
-- Show customer's sales across address changes
SELECT
'Customer 101 Sales - Different Addresses at Different Times' as Description,
f.sale_id,
f.date_sk as sale_date,
dc.address as address_at_time_of_sale,
dc.is_current as is_current_address,
dp.product_name,
f.amount
FROM fact_sales f
JOIN dim_customer dc ON f.customer_sk = dc.customer_sk
JOIN dim_product dp ON f.product_sk = dp.product_sk
WHERE dc.customer_id = 101
ORDER BY f.sale_id;
-- Analytical Query: Sales by Customer (all versions aggregated by business key)
SELECT
'Sales Summary by Customer (All Versions)' as Description,
dc.customer_id,
MAX(CASE WHEN dc.is_current = 1 THEN dc.customer_name END) as current_name,
COUNT(DISTINCT f.sale_id) as total_sales_count,
SUM(f.amount) as total_sales_amount
FROM fact_sales f
JOIN dim_customer dc ON f.customer_sk = dc.customer_sk
GROUP BY dc.customer_id
ORDER BY total_sales_amount DESC;
-- Show current view (most common query pattern)
SELECT
'Current Snapshot - Latest Customer Info with All Sales' as Description,
curr_dc.customer_name,
curr_dc.address as current_address,
COUNT(f.sale_id) as total_sales,
SUM(f.amount) as total_amount
FROM fact_sales f
JOIN dim_customer dc ON f.customer_sk = dc.customer_sk -- Historical join
JOIN dim_customer curr_dc
ON dc.customer_id = curr_dc.customer_id
AND curr_dc.is_current = 1 -- Current info
GROUP BY curr_dc.customer_sk, curr_dc.customer_name, curr_dc.address;
-- ===============================================
-- CLEANUP TEMPORARY TABLES
-- ===============================================
DROP TEMPORARY TABLE IF EXISTS changed_customers;
DROP TEMPORARY TABLE IF EXISTS changed_products;