***********************************************************************************
-- ==========================================
-- Q1: Compute cumulative revenue per day using Delta tables
-- ==========================================
CREATE TABLE RevenueDaily (
product_id INT,
revenue INT,
txn_date DATE
);
INSERT INTO RevenueDaily VALUES
(1, 100, '2024-01-01'),
(1, 200, '2024-01-02'),
(1, 150, '2024-01-03'),
(1, 180, '2024-01-04');
SELECT product_id, txn_date, SUM(revenue) OVER (PARTITION BY product_id ORDER BY txn_date) AS cumulative_revenue
FROM RevenueDaily;
-- ==========================================
-- Q2: Retrieve the first and last login per user from event log table
-- ==========================================
CREATE TABLE EventLog (
user_id INT,
login_date DATE
);
INSERT INTO EventLog VALUES
(1, '2024-01-01'), (1, '2024-01-02'), (1, '2024-01-03'), (2, '2024-01-01'), (2, '2024-01-04');
SELECT user_id,
MIN(login_date) AS first_login,
MAX(login_date) AS last_login
FROM EventLog
GROUP BY user_id;
-- ==========================================
-- Q3: Find top 3 customers by spend in each region
-- ==========================================
CREATE TABLE Customers (
customer_id INT,
name VARCHAR(50),
region VARCHAR(50)
);
CREATE TABLE Orders (
order_id INT,
customer_id INT,
amount INT,
order_date DATE
);
INSERT INTO Customers VALUES
(1, 'Alice', 'North'), (2, 'Bob', 'North'), (3, 'Cathy', 'South'), (4, 'David', 'South');
INSERT INTO Orders VALUES
(101, 1, 500, '2024-01-01'), (102, 2, 600, '2024-01-02'), (103, 3, 700, '2024-02-01'), (104, 4, 300, '2024-03-01');
SELECT * FROM (
SELECT c.region, o.customer_id, SUM(o.amount) AS total_spend,
RANK() OVER (PARTITION BY c.region ORDER BY SUM(o.amount) DESC) AS rk
FROM Orders o
JOIN Customers c ON o.customer_id = c.customer_id
GROUP BY c.region, o.customer_id
) t
WHERE rk <= 3;
-- ==========================================
-- Q4: Detect duplicate records and delete extras safely
-- ==========================================
CREATE TABLE Transactions (
txn_id INT,
user_id INT,
amount INT,
txn_date DATE
);
INSERT INTO Transactions VALUES
(1, 1, 100, '2024-01-01'),
(2, 2, 400, '2024-01-15'),
(3, 2, 400, '2024-01-15');
WITH RankedTxns AS (
SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id, amount, txn_date ORDER BY txn_id) AS rn
FROM Transactions
)
DELETE FROM Transactions
WHERE txn_id IN (SELECT txn_id FROM RankedTxns WHERE rn > 1);
-- ==========================================
-- Q5: Get users who made purchases in 3 consecutive months
-- ==========================================
SELECT user_id
FROM (
SELECT user_id,
DATE_TRUNC('month', txn_date) AS txn_month
FROM Transactions
GROUP BY user_id, DATE_TRUNC('month', txn_date)
) t
WINDOW w AS (PARTITION BY user_id ORDER BY txn_month)
QUALIFY txn_month = LAG(txn_month, 1) OVER w + INTERVAL '1 month'
AND txn_month = LAG(txn_month, 2) OVER w + INTERVAL '2 month';
-- ==========================================
-- Q6: Identify skewed joins and propose fixes using broadcast hints
-- ==========================================
-- Assume Orders is large, Regions is small:
-- Use broadcast join:
-- SELECT /*+ BROADCAST(Regions) */ * FROM Orders JOIN Regions ON Orders.region_id = Regions.id;
-- ==========================================
-- Q7: Compute 7-day moving average of product sales
-- ==========================================
SELECT product_id, txn_date,
AVG(revenue) OVER (PARTITION BY product_id ORDER BY txn_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg
FROM RevenueDaily;
-- ==========================================
-- Q8: Pivot daily sales into month-wise columns
-- ==========================================
CREATE TABLE Sales (
product_id INT,
sale_date DATE,
quantity INT
);
INSERT INTO Sales VALUES
(1, '2024-01-01', 10), (1, '2024-01-02', 5), (2, '2024-01-01', 1);
SELECT product_id,
SUM(CASE WHEN MONTH(sale_date) = 1 THEN quantity ELSE 0 END) AS Jan,
SUM(CASE WHEN MONTH(sale_date) = 2 THEN quantity ELSE 0 END) AS Feb
FROM Sales
GROUP BY product_id;
-- ==========================================
-- Q9: Customers who bought products every month in a year
-- ==========================================
SELECT customer_id
FROM Orders
WHERE YEAR(order_date) = 2024
GROUP BY customer_id
HAVING COUNT(DISTINCT MONTH(order_date)) = 12;
-- ==========================================
-- Q10: Rank products by sales per year
-- ==========================================
SELECT product_id, YEAR(sale_date) AS yr, SUM(quantity) AS total_qty,
RANK() OVER (PARTITION BY YEAR(sale_date) ORDER BY SUM(quantity) DESC) AS rnk
FROM Sales
GROUP BY product_id, YEAR(sale_date);
-- ==========================================
-- Q11: Employees earning more than their department average
-- ==========================================
CREATE TABLE Employee (
emp_id INT,
name VARCHAR(50),
dept_id INT,
salary INT
);
INSERT INTO Employee VALUES
(1, 'John', 101, 100000),
(2, 'Jane', 101, 110000),
(3, 'Dave', 102, 95000);
SELECT *
FROM Employee e
JOIN (
SELECT dept_id, AVG(salary) AS avg_sal
FROM Employee
GROUP BY dept_id
) d ON e.dept_id = d.dept_id
WHERE e.salary > d.avg_sal;
-- ==========================================
-- Q12: Median transaction amount (no built-in median)
-- ==========================================
SELECT AVG(amount) AS median_val
FROM (
SELECT amount, ROW_NUMBER() OVER (ORDER BY amount) AS rn,
COUNT(*) OVER () AS cnt
FROM Transactions
) t
WHERE rn IN (FLOOR((cnt+1)/2.0), CEIL((cnt+1)/2.0));
-- ==========================================
-- Q13: Users who placed their first order in last 30 days
-- ==========================================
CREATE TABLE UserOrders (
user_id INT,
order_id INT,
order_date DATE
);
INSERT INTO UserOrders VALUES
(1, 1001, CURRENT_DATE - INTERVAL '10' DAY),
(2, 1002, CURRENT_DATE - INTERVAL '45' DAY);
SELECT user_id
FROM (
SELECT user_id, MIN(order_date) AS first_order
FROM UserOrders
GROUP BY user_id
) t
WHERE first_order >= CURRENT_DATE - INTERVAL '30' DAY;
-- ==========================================
-- Q14: Compare price change between two dates per product
-- ==========================================
CREATE TABLE ProductPrices (
product_id INT,
price_date DATE,
price INT
);
INSERT INTO ProductPrices VALUES
(1, '2024-01-01', 200), (1, '2024-02-01', 210);
SELECT product_id,
MAX(CASE WHEN price_date = '2024-01-01' THEN price END) AS old_price,
MAX(CASE WHEN price_date = '2024-02-01' THEN price END) AS new_price
FROM ProductPrices
GROUP BY product_id;
-- ==========================================
-- Q15: Customers whose first and last transaction is same day
-- ==========================================
SELECT user_id
FROM Transactions
GROUP BY user_id
HAVING MIN(txn_date) = MAX(txn_date);
-- ==========================================
-- Q16: Percentage of returning users per month
-- ==========================================
SELECT EXTRACT(MONTH FROM order_date) AS mth,
COUNT(DISTINCT user_id) FILTER (WHERE is_returning) * 100.0 /
COUNT(DISTINCT user_id) AS returning_pct
FROM (
SELECT user_id, order_date,
LAG(order_date) OVER (PARTITION BY user_id ORDER BY order_date) IS NOT NULL AS is_returning
FROM UserOrders
) t
GROUP BY EXTRACT(MONTH FROM order_date);
-- ==========================================
-- Q17: Products never sold
-- ==========================================
CREATE TABLE Products (
product_id INT,
product_name VARCHAR(50)
);
INSERT INTO Products VALUES
(1, 'Phone'), (2, 'TV'), (3, 'Laptop');
SELECT *
FROM Products
WHERE product_id NOT IN (SELECT DISTINCT product_id FROM Sales);
-- ==========================================
-- Q18: Detect schema drift in Delta Lake
-- ==========================================
CREATE TABLE DeltaHistory (
product_id INT,
version INT,
schema_snapshot STRING
);
INSERT INTO DeltaHistory VALUES
(1, 1, '{"product_id": "int", "price": "int"}'),
(1, 2, '{"product_id": "int", "price": "string"}');
SELECT product_id
FROM DeltaHistory
GROUP BY product_id
HAVING COUNT(DISTINCT schema_snapshot) > 1;
-- ==========================================
-- Q19: Departments with at least 2 employees having same salary
-- ==========================================
SELECT dept_id
FROM Employee
GROUP BY dept_id, salary
HAVING COUNT(*) >= 2;
-- ==========================================
-- Q20: Group users by 3+ day login streaks
-- ==========================================
SELECT user_id, MIN(login_date) AS streak_start, MAX(login_date) AS streak_end
FROM (
SELECT user_id, login_date,
login_date - INTERVAL '1 day' * ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS grp
FROM EventLog
) t
GROUP BY user_id, grp
HAVING COUNT(*) >= 3;
Comments
Post a Comment