#!/bin/bash

# Large Test Database Creator - 50GB with Blobs
# Creates a substantial database for testing backup/restore performance on large datasets

set -e

DB_NAME="testdb_large_50gb"
TARGET_SIZE_GB=50
CHUNK_SIZE_MB=10                                          # Size of each blob chunk in MB
TOTAL_CHUNKS=$((TARGET_SIZE_GB * 1024 / CHUNK_SIZE_MB))   # Total number of chunks needed
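# With the defaults above: 50 * 1024 / 10 = 5120 chunks of 10 MB each (~50 GiB of raw blob data).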

echo "=================================================="
echo "Creating Large Test Database: $DB_NAME"
echo "Target Size: ${TARGET_SIZE_GB}GB"
echo "Chunk Size: ${CHUNK_SIZE_MB}MB"
echo "Total Chunks: $TOTAL_CHUNKS"
echo "=================================================="

# Check available space
AVAILABLE_GB=$(df / | tail -1 | awk '{print int($4/1024/1024)}')
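# The check above looks at the filesystem mounted at /; if the PostgreSQL data
# directory lives on a different mount, point df at that path instead.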
echo "Available disk space: ${AVAILABLE_GB}GB"

if [ $AVAILABLE_GB -lt $((TARGET_SIZE_GB + 10)) ]; then
    echo "❌ ERROR: Insufficient disk space. Need at least $((TARGET_SIZE_GB + 10))GB"
    exit 1
fi

echo "✅ Sufficient disk space available"

# Database connection settings
PGUSER="postgres"
PGHOST="localhost"
PGPORT="5432"
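# Kept for reference only: these variables are not exported, so they do not affect
# the psql calls below, which run as the postgres OS user via sudo over the local socket.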

echo ""
echo "1. Creating database and schema..."

# Drop and recreate database
sudo -u postgres psql -c "DROP DATABASE IF EXISTS $DB_NAME;" 2>/dev/null || true
sudo -u postgres psql -c "CREATE DATABASE $DB_NAME;"

# Create tables with different data types
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Table for large binary objects (blobs)
CREATE TABLE large_blobs (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255),
    description TEXT,
    blob_data BYTEA,
    created_at TIMESTAMP DEFAULT NOW(),
    size_mb INTEGER
);

-- Table for structured data with indexes
CREATE TABLE test_data (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    username VARCHAR(100) NOT NULL,
    email VARCHAR(255) NOT NULL,
    profile_data JSONB,
    large_text TEXT,
    random_number NUMERIC(15,2),
    created_at TIMESTAMP DEFAULT NOW()
);

-- Table for time series data (lots of rows)
CREATE TABLE metrics (
    id BIGSERIAL PRIMARY KEY,
    timestamp TIMESTAMP NOT NULL,
    metric_name VARCHAR(100) NOT NULL,
    value DOUBLE PRECISION NOT NULL,
    tags JSONB,
    metadata TEXT
);

-- Indexes for performance
CREATE INDEX idx_test_data_user_id ON test_data(user_id);
CREATE INDEX idx_test_data_email ON test_data(email);
CREATE INDEX idx_test_data_created ON test_data(created_at);
CREATE INDEX idx_metrics_timestamp ON metrics(timestamp);
CREATE INDEX idx_metrics_name ON metrics(metric_name);
CREATE INDEX idx_metrics_tags ON metrics USING GIN(tags);

-- Large text table for document storage
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    title VARCHAR(500),
    content TEXT,
    document_data BYTEA,
    tags TEXT[],
    created_at TIMESTAMP DEFAULT NOW()
);

CREATE INDEX idx_documents_tags ON documents USING GIN(tags);
EOF
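
# Optional sanity check once the schema exists: list the freshly created (still empty) tables, e.g.
#   sudo -u postgres psql -d $DB_NAME -c '\dt+'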

echo "✅ Database schema created"

echo ""
echo "2. Generating large blob data..."

# Function to generate random data
generate_blob_data() {
    local chunk_num=$1   # currently unused; kept so callers can pass the chunk index
    local size_mb=$2

    # Generate random binary data using dd and base64
    dd if=/dev/urandom bs=1M count=$size_mb 2>/dev/null | base64 -w 0
}
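# Note: base64 turns every 3 raw bytes into 4 text characters, so each 10 MB chunk
# travels to psql as roughly 13.3 MB of text before decode() turns it back into BYTEA.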

echo "Inserting $TOTAL_CHUNKS blob chunks of ${CHUNK_SIZE_MB}MB each..."

# Insert blob data in chunks
for i in $(seq 1 $TOTAL_CHUNKS); do
    echo -n " Progress: $i/$TOTAL_CHUNKS ($(($i * 100 / $TOTAL_CHUNKS))%) - "

    # Generate blob data
    BLOB_DATA=$(generate_blob_data $i $CHUNK_SIZE_MB)
    # Insert into database. The SQL is fed to psql on stdin because the base64
    # payload (~13 MB per chunk) is far too large to pass as a single -c argument
    # (Linux caps the size of any one exec argument at roughly 128 KiB).
    sudo -u postgres psql -d $DB_NAME > /dev/null << SQL
    INSERT INTO large_blobs (name, description, blob_data, size_mb)
    VALUES (
        'blob_chunk_$i',
        'Large binary data chunk $i of $TOTAL_CHUNKS for testing backup/restore performance',
        decode('$BLOB_DATA', 'base64'),
        $CHUNK_SIZE_MB
    );
SQL

    echo "✅ Chunk $i inserted"

    # Every 10 chunks, show current database size
    if [ $((i % 10)) -eq 0 ]; then
        CURRENT_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "
            SELECT pg_size_pretty(pg_database_size('$DB_NAME'));" 2>/dev/null || echo "Unknown")
        echo " Current database size: $CURRENT_SIZE"
    fi
done
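# Each chunk is inserted in its own implicit transaction (psql autocommit), so an
# interrupted run simply leaves the chunks loaded so far in place; re-running the
# script starts from scratch because the database is dropped and recreated above.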

echo ""
echo "3. Generating structured test data..."

# Insert large amounts of structured data
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Insert 1 million rows of test data (will add significant size)
INSERT INTO test_data (user_id, username, email, profile_data, large_text, random_number)
SELECT
    generate_series % 100000 as user_id,
    'user_' || generate_series as username,
    'user_' || generate_series || '@example.com' as email,
    ('{"preferences": {"theme": "dark", "language": "en", "notifications": true}, "metadata": {"last_login": "2024-01-01", "session_count": ' || (generate_series % 1000) || ', "data": "' || repeat('x', 100) || '"}}')::jsonb as profile_data,
    repeat('This is large text content for testing. ', 50) || ' Row: ' || generate_series as large_text,
    random() * 1000000 as random_number
FROM generate_series(1, 1000000);

-- Insert time series data (2 million rows)
INSERT INTO metrics (timestamp, metric_name, value, tags, metadata)
SELECT
    NOW() - (generate_series || ' minutes')::interval as timestamp,
    CASE (generate_series % 5)
        WHEN 0 THEN 'cpu_usage'
        WHEN 1 THEN 'memory_usage'
        WHEN 2 THEN 'disk_io'
        WHEN 3 THEN 'network_tx'
        ELSE 'network_rx'
    END as metric_name,
    random() * 100 as value,
    ('{"host": "server_' || (generate_series % 100) || '", "env": "' ||
        CASE (generate_series % 3) WHEN 0 THEN 'prod' WHEN 1 THEN 'staging' ELSE 'dev' END ||
        '", "region": "us-' || CASE (generate_series % 2) WHEN 0 THEN 'east' ELSE 'west' END || '"}')::jsonb as tags,
    'Generated metric data for testing - ' || repeat('metadata_', 10) as metadata
FROM generate_series(1, 2000000);

-- Insert document data with embedded binary content
-- (convert_to() produces BYTEA from text; a plain ::bytea cast from text is not allowed)
INSERT INTO documents (title, content, document_data, tags)
SELECT
    'Document ' || generate_series as title,
    repeat('This is document content with lots of text to increase database size. ', 100) ||
        ' Document ID: ' || generate_series || '. ' ||
        repeat('Additional content to make documents larger. ', 20) as content,
    convert_to('Binary document data for doc ' || generate_series || ': ' || repeat('BINARY_DATA_', 1000), 'UTF8') as document_data,
    ARRAY['tag_' || (generate_series % 10), 'category_' || (generate_series % 5), 'type_document'] as tags
FROM generate_series(1, 100000);
EOF
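
# Optionally refresh planner statistics after the bulk load so row and size estimates
# reflect the new data (uncomment to enable):
#   sudo -u postgres psql -d $DB_NAME -c "ANALYZE;"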

echo "✅ Structured data inserted"

echo ""
echo "4. Final database statistics..."

# Get final database size and statistics
sudo -u postgres psql -d $DB_NAME << 'EOF'
SELECT
    'Database Size' as metric,
    pg_size_pretty(pg_database_size(current_database())) as value
UNION ALL
SELECT
    'Table: large_blobs',
    pg_size_pretty(pg_total_relation_size('large_blobs'))
UNION ALL
SELECT
    'Table: test_data',
    pg_size_pretty(pg_total_relation_size('test_data'))
UNION ALL
SELECT
    'Table: metrics',
    pg_size_pretty(pg_total_relation_size('metrics'))
UNION ALL
SELECT
    'Table: documents',
    pg_size_pretty(pg_total_relation_size('documents'));

-- Row counts
SELECT 'large_blobs rows' as table_name, COUNT(*) as row_count FROM large_blobs
UNION ALL
SELECT 'test_data rows', COUNT(*) FROM test_data
UNION ALL
SELECT 'metrics rows', COUNT(*) FROM metrics
UNION ALL
SELECT 'documents rows', COUNT(*) FROM documents;
EOF

echo ""
echo "=================================================="
echo "✅ Large test database creation completed!"
echo "Database: $DB_NAME"
echo "=================================================="

# Show final size
FINAL_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "SELECT pg_size_pretty(pg_database_size('$DB_NAME'));" 2>/dev/null)
echo "Final database size: $FINAL_SIZE"

echo ""
echo "You can now test backup/restore operations:"
echo "  # Backup the large database"
echo "  sudo -u postgres ./dbbackup backup single $DB_NAME"
echo ""
echo "  # Backup entire cluster (including this large DB)"
echo "  sudo -u postgres ./dbbackup backup cluster"
echo ""
echo "  # Check database size anytime:"
echo "  sudo -u postgres psql -d $DB_NAME -c \"SELECT pg_size_pretty(pg_database_size('$DB_NAME'));\""