ADVANCED SQL + DELTA LAKE INTERVIEW SET

 interview ( 2+ YOE ) 

Difficulty level = Intermediate

1.⁠ ⁠Write a SQL query using Delta tables to compute cumulative revenue per day

2.⁠ ⁠Retrieve the first and last login per user from a huge event log table

3.⁠ ⁠Find the top 3 customers by spend in each region (use window functions)

4.⁠ ⁠Detect duplicate records in a transactional table and delete extras safely

5.⁠ ⁠Get users who made purchases in 3 consecutive months

6.⁠ ⁠Identify skewed joins in SQL and propose fixes using broadcast hints

7.⁠ ⁠Compute a 7-day moving average of product sales on Delta Lake

8.⁠ ⁠Pivot daily sales into month-wise columns

9.⁠ ⁠Find customers who bought products every month in a year

10.⁠ ⁠Rank products by sales per year, resetting rank each year

11.⁠ ⁠Find employees earning more than their department average

12.⁠ ⁠Find the median transaction amount (no built-in median)

13.⁠ ⁠Get all users who placed their first order in the last 30 days

14.⁠ ⁠Compare price change between two dates for each product

15.⁠ ⁠Identify customers whose first and last transaction is on the same day

16.⁠ ⁠Calculate the percentage of returning users for each month

17.⁠ ⁠Retrieve products that have never been sold

18.⁠ ⁠Detect schema drift in historical Delta data snapshots

19.⁠ ⁠Find departments where at least 2 employees have identical salaries

20.⁠ ⁠Group users by login streaks of 3+ consecutive days


***********************************************************************************
-- ==========================================
-- Q1: Compute cumulative revenue per day using Delta tables
-- ==========================================
CREATE TABLE RevenueDaily (
    product_id INT,
    revenue INT,
    txn_date DATE
);

INSERT INTO RevenueDaily VALUES
(1, 100, '2024-01-01'),
(1, 200, '2024-01-02'),
(1, 150, '2024-01-03'),
(1, 180, '2024-01-04');

SELECT product_id, txn_date, SUM(revenue) OVER (PARTITION BY product_id ORDER BY txn_date) AS cumulative_revenue
FROM RevenueDaily;

-- ==========================================
-- Q2: Retrieve the first and last login per user from event log table
-- ==========================================
CREATE TABLE EventLog (
    user_id INT,
    login_date DATE
);

INSERT INTO EventLog VALUES
(1, '2024-01-01'), (1, '2024-01-02'), (1, '2024-01-03'), (2, '2024-01-01'), (2, '2024-01-04');

SELECT user_id,
       MIN(login_date) AS first_login,
       MAX(login_date) AS last_login
FROM EventLog
GROUP BY user_id;

-- ==========================================
-- Q3: Find top 3 customers by spend in each region
-- ==========================================
CREATE TABLE Customers (
    customer_id INT,
    name VARCHAR(50),
    region VARCHAR(50)
);

CREATE TABLE Orders (
    order_id INT,
    customer_id INT,
    amount INT,
    order_date DATE
);

INSERT INTO Customers VALUES
(1, 'Alice', 'North'), (2, 'Bob', 'North'), (3, 'Cathy', 'South'), (4, 'David', 'South');
INSERT INTO Orders VALUES
(101, 1, 500, '2024-01-01'), (102, 2, 600, '2024-01-02'), (103, 3, 700, '2024-02-01'), (104, 4, 300, '2024-03-01');

SELECT * FROM (
  SELECT c.region, o.customer_id, SUM(o.amount) AS total_spend,
         RANK() OVER (PARTITION BY c.region ORDER BY SUM(o.amount) DESC) AS rk
  FROM Orders o
  JOIN Customers c ON o.customer_id = c.customer_id
  GROUP BY c.region, o.customer_id
) t
WHERE rk <= 3;

-- ==========================================
-- Q4: Detect duplicate records and delete extras safely
-- ==========================================
CREATE TABLE Transactions (
    txn_id INT,
    user_id INT,
    amount INT,
    txn_date DATE
);

INSERT INTO Transactions VALUES
(1, 1, 100, '2024-01-01'),
(2, 2, 400, '2024-01-15'),
(3, 2, 400, '2024-01-15');

WITH RankedTxns AS (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id, amount, txn_date ORDER BY txn_id) AS rn
  FROM Transactions
)
DELETE FROM Transactions
WHERE txn_id IN (SELECT txn_id FROM RankedTxns WHERE rn > 1);

-- ==========================================
-- Q5: Get users who made purchases in 3 consecutive months
-- ==========================================
SELECT user_id
FROM (
  SELECT user_id,
         DATE_TRUNC('month', txn_date) AS txn_month
  FROM Transactions
  GROUP BY user_id, DATE_TRUNC('month', txn_date)
) t
WINDOW w AS (PARTITION BY user_id ORDER BY txn_month)
QUALIFY txn_month = LAG(txn_month, 1) OVER w + INTERVAL '1 month'
     AND txn_month = LAG(txn_month, 2) OVER w + INTERVAL '2 month';

-- ==========================================
-- Q6: Identify skewed joins and propose fixes using broadcast hints
-- ==========================================
-- Assume Orders is large, Regions is small:
-- Use broadcast join:
-- SELECT /*+ BROADCAST(Regions) */ * FROM Orders JOIN Regions ON Orders.region_id = Regions.id;

-- ==========================================
-- Q7: Compute 7-day moving average of product sales
-- ==========================================
SELECT product_id, txn_date,
       AVG(revenue) OVER (PARTITION BY product_id ORDER BY txn_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg
FROM RevenueDaily;

-- ==========================================
-- Q8: Pivot daily sales into month-wise columns
-- ==========================================
CREATE TABLE Sales (
    product_id INT,
    sale_date DATE,
    quantity INT
);

INSERT INTO Sales VALUES
(1, '2024-01-01', 10), (1, '2024-01-02', 5), (2, '2024-01-01', 1);

SELECT product_id,
       SUM(CASE WHEN MONTH(sale_date) = 1 THEN quantity ELSE 0 END) AS Jan,
       SUM(CASE WHEN MONTH(sale_date) = 2 THEN quantity ELSE 0 END) AS Feb
FROM Sales
GROUP BY product_id;

-- ==========================================
-- Q9: Customers who bought products every month in a year
-- ==========================================
SELECT customer_id
FROM Orders
WHERE YEAR(order_date) = 2024
GROUP BY customer_id
HAVING COUNT(DISTINCT MONTH(order_date)) = 12;

-- ==========================================
-- Q10: Rank products by sales per year
-- ==========================================
SELECT product_id, YEAR(sale_date) AS yr, SUM(quantity) AS total_qty,
       RANK() OVER (PARTITION BY YEAR(sale_date) ORDER BY SUM(quantity) DESC) AS rnk
FROM Sales
GROUP BY product_id, YEAR(sale_date);

-- ==========================================
-- Q11: Employees earning more than their department average
-- ==========================================
CREATE TABLE Employee (
    emp_id INT,
    name VARCHAR(50),
    dept_id INT,
    salary INT
);

INSERT INTO Employee VALUES
(1, 'John', 101, 100000),
(2, 'Jane', 101, 110000),
(3, 'Dave', 102, 95000);

SELECT *
FROM Employee e
JOIN (
  SELECT dept_id, AVG(salary) AS avg_sal
  FROM Employee
  GROUP BY dept_id
) d ON e.dept_id = d.dept_id
WHERE e.salary > d.avg_sal;

-- ==========================================
-- Q12: Median transaction amount (no built-in median)
-- ==========================================
SELECT AVG(amount) AS median_val
FROM (
  SELECT amount, ROW_NUMBER() OVER (ORDER BY amount) AS rn,
         COUNT(*) OVER () AS cnt
  FROM Transactions
) t
WHERE rn IN (FLOOR((cnt+1)/2.0), CEIL((cnt+1)/2.0));

-- ==========================================
-- Q13: Users who placed their first order in last 30 days
-- ==========================================
CREATE TABLE UserOrders (
    user_id INT,
    order_id INT,
    order_date DATE
);

INSERT INTO UserOrders VALUES
(1, 1001, CURRENT_DATE - INTERVAL '10' DAY),
(2, 1002, CURRENT_DATE - INTERVAL '45' DAY);

SELECT user_id
FROM (
  SELECT user_id, MIN(order_date) AS first_order
  FROM UserOrders
  GROUP BY user_id
) t
WHERE first_order >= CURRENT_DATE - INTERVAL '30' DAY;

-- ==========================================
-- Q14: Compare price change between two dates per product
-- ==========================================
CREATE TABLE ProductPrices (
    product_id INT,
    price_date DATE,
    price INT
);

INSERT INTO ProductPrices VALUES
(1, '2024-01-01', 200), (1, '2024-02-01', 210);

SELECT product_id,
       MAX(CASE WHEN price_date = '2024-01-01' THEN price END) AS old_price,
       MAX(CASE WHEN price_date = '2024-02-01' THEN price END) AS new_price
FROM ProductPrices
GROUP BY product_id;

-- ==========================================
-- Q15: Customers whose first and last transaction is same day
-- ==========================================
SELECT user_id
FROM Transactions
GROUP BY user_id
HAVING MIN(txn_date) = MAX(txn_date);

-- ==========================================
-- Q16: Percentage of returning users per month
-- ==========================================
SELECT EXTRACT(MONTH FROM order_date) AS mth,
       COUNT(DISTINCT user_id) FILTER (WHERE is_returning) * 100.0 /
       COUNT(DISTINCT user_id) AS returning_pct
FROM (
  SELECT user_id, order_date,
         LAG(order_date) OVER (PARTITION BY user_id ORDER BY order_date) IS NOT NULL AS is_returning
  FROM UserOrders
) t
GROUP BY EXTRACT(MONTH FROM order_date);

-- ==========================================
-- Q17: Products never sold
-- ==========================================
CREATE TABLE Products (
    product_id INT,
    product_name VARCHAR(50)
);

INSERT INTO Products VALUES
(1, 'Phone'), (2, 'TV'), (3, 'Laptop');

SELECT *
FROM Products
WHERE product_id NOT IN (SELECT DISTINCT product_id FROM Sales);

-- ==========================================
-- Q18: Detect schema drift in Delta Lake
-- ==========================================
CREATE TABLE DeltaHistory (
    product_id INT,
    version INT,
    schema_snapshot STRING
);

INSERT INTO DeltaHistory VALUES
(1, 1, '{"product_id": "int", "price": "int"}'),
(1, 2, '{"product_id": "int", "price": "string"}');

SELECT product_id
FROM DeltaHistory
GROUP BY product_id
HAVING COUNT(DISTINCT schema_snapshot) > 1;

-- ==========================================
-- Q19: Departments with at least 2 employees having same salary
-- ==========================================
SELECT dept_id
FROM Employee
GROUP BY dept_id, salary
HAVING COUNT(*) >= 2;

-- ==========================================
-- Q20: Group users by 3+ day login streaks
-- ==========================================
SELECT user_id, MIN(login_date) AS streak_start, MAX(login_date) AS streak_end
FROM (
  SELECT user_id, login_date,
         login_date - INTERVAL '1 day' * ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS grp
  FROM EventLog
) t
GROUP BY user_id, grp
HAVING COUNT(*) >= 3;

Comments

Popular posts from this blog

SyBase Database Migration to SQL Server

Basics of US Healthcare -Medical Billing