Files
dbbackup/create_50gb_db_optimized.sh

255 lines
8.8 KiB
Bash
Executable File

#!/bin/bash
# Optimized Large Database Creator - 50GB target
# More efficient approach using PostgreSQL's built-in functions
set -e
DB_NAME="testdb_50gb"
TARGET_SIZE_GB=50
echo "=================================================="
echo "OPTIMIZED Large Test Database Creator"
echo "Database: $DB_NAME"
echo "Target Size: ${TARGET_SIZE_GB}GB"
echo "=================================================="
# Check available space
AVAILABLE_GB=$(df / | tail -1 | awk '{print int($4/1024/1024)}')
echo "Available disk space: ${AVAILABLE_GB}GB"
if [ $AVAILABLE_GB -lt $((TARGET_SIZE_GB + 20)) ]; then
echo "❌ ERROR: Insufficient disk space. Need at least $((TARGET_SIZE_GB + 20))GB buffer"
exit 1
fi
echo "✅ Sufficient disk space available"
echo ""
echo "1. Creating optimized database schema..."
# Drop and recreate database
sudo -u postgres psql -c "DROP DATABASE IF EXISTS $DB_NAME;" 2>/dev/null || true
sudo -u postgres psql -c "CREATE DATABASE $DB_NAME;"
# Create optimized schema for rapid data generation
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Large blob table with efficient storage
CREATE TABLE mega_blobs (
id BIGSERIAL PRIMARY KEY,
chunk_id INTEGER NOT NULL,
blob_data BYTEA NOT NULL,
created_at TIMESTAMP DEFAULT NOW()
);
-- Massive text table for document storage
CREATE TABLE big_documents (
id BIGSERIAL PRIMARY KEY,
doc_name VARCHAR(100),
content TEXT NOT NULL,
metadata JSONB,
created_at TIMESTAMP DEFAULT NOW()
);
-- High-volume metrics table
CREATE TABLE huge_metrics (
id BIGSERIAL PRIMARY KEY,
timestamp TIMESTAMP NOT NULL,
sensor_id INTEGER NOT NULL,
metric_type VARCHAR(50) NOT NULL,
value_data TEXT NOT NULL, -- Large text field
binary_payload BYTEA,
created_at TIMESTAMP DEFAULT NOW()
);
-- Indexes for realism
CREATE INDEX idx_mega_blobs_chunk ON mega_blobs(chunk_id);
CREATE INDEX idx_big_docs_name ON big_documents(doc_name);
CREATE INDEX idx_huge_metrics_timestamp ON huge_metrics(timestamp);
CREATE INDEX idx_huge_metrics_sensor ON huge_metrics(sensor_id);
EOF
echo "✅ Optimized schema created"
echo ""
echo "2. Generating large-scale data using PostgreSQL's generate_series..."
# Strategy: Use PostgreSQL's efficient bulk operations
echo "Inserting massive text documents (targeting ~20GB)..."
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Insert 2 million large text documents (~20GB estimated)
INSERT INTO big_documents (doc_name, content, metadata)
SELECT
'doc_' || generate_series,
-- Each document: ~10KB of text content
repeat('Lorem ipsum dolor sit amet, consectetur adipiscing elit. ' ||
'Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' ||
'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. ' ||
'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum. ' ||
'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui. ' ||
'Nulla pariatur. Sed ut perspiciatis unde omnis iste natus error sit. ' ||
'At vero eos et accusamus et iusto odio dignissimos ducimus qui blanditiis. ' ||
'Document content section ' || generate_series || '. ', 50),
('{"doc_type": "test", "size_category": "large", "batch": ' || (generate_series / 10000) ||
', "tags": ["bulk_data", "test_doc", "large_dataset"]}')::jsonb
FROM generate_series(1, 2000000);
EOF
echo "✅ Large documents inserted"
# Check current size
CURRENT_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "SELECT pg_database_size('$DB_NAME') / 1024 / 1024 / 1024.0;" 2>/dev/null)
echo "Current database size: ${CURRENT_SIZE}GB"
echo "Inserting high-volume metrics data (targeting additional ~15GB)..."
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Insert 5 million metrics records with large payloads (~15GB estimated)
INSERT INTO huge_metrics (timestamp, sensor_id, metric_type, value_data, binary_payload)
SELECT
NOW() - (generate_series * INTERVAL '1 second'),
generate_series % 10000, -- 10,000 different sensors
CASE (generate_series % 5)
WHEN 0 THEN 'temperature'
WHEN 1 THEN 'humidity'
WHEN 2 THEN 'pressure'
WHEN 3 THEN 'vibration'
ELSE 'electromagnetic'
END,
-- Large JSON-like text payload (~3KB each)
'{"readings": [' ||
'{"timestamp": "' || (NOW() - (generate_series * INTERVAL '1 second'))::text ||
'", "value": ' || (random() * 1000)::int ||
', "quality": "good", "metadata": "' || repeat('data_', 20) || '"},' ||
'{"timestamp": "' || (NOW() - ((generate_series + 1) * INTERVAL '1 second'))::text ||
'", "value": ' || (random() * 1000)::int ||
', "quality": "good", "metadata": "' || repeat('data_', 20) || '"},' ||
'{"timestamp": "' || (NOW() - ((generate_series + 2) * INTERVAL '1 second'))::text ||
'", "value": ' || (random() * 1000)::int ||
', "quality": "good", "metadata": "' || repeat('data_', 20) || '"}' ||
'], "sensor_info": "' || repeat('sensor_metadata_', 30) ||
'", "calibration": "' || repeat('calibration_data_', 25) || '"}',
-- Binary payload (~1KB each)
decode(encode(repeat('BINARY_SENSOR_DATA_CHUNK_', 25)::bytea, 'base64'), 'base64')
FROM generate_series(1, 5000000);
EOF
echo "✅ Metrics data inserted"
# Check size again
CURRENT_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "SELECT pg_database_size('$DB_NAME') / 1024 / 1024 / 1024.0;" 2>/dev/null)
echo "Current database size: ${CURRENT_SIZE}GB"
echo "Inserting binary blob data to reach 50GB target..."
# Calculate remaining size needed
REMAINING_GB=$(echo "$TARGET_SIZE_GB - $CURRENT_SIZE" | bc -l 2>/dev/null || echo "15")
REMAINING_MB=$(echo "$REMAINING_GB * 1024" | bc -l 2>/dev/null || echo "15360")
echo "Need approximately ${REMAINING_GB}GB more data..."
# Insert binary blobs to fill remaining space
sudo -u postgres psql -d $DB_NAME << EOF
-- Insert large binary chunks to reach target size
-- Each blob will be approximately 5MB
INSERT INTO mega_blobs (chunk_id, blob_data)
SELECT
generate_series,
-- Generate ~5MB of binary data per row
decode(encode(repeat('LARGE_BINARY_CHUNK_FOR_TESTING_PURPOSES_', 100000)::bytea, 'base64'), 'base64')
FROM generate_series(1, ${REMAINING_MB%.*} / 5);
EOF
echo "✅ Binary blob data inserted"
echo ""
echo "3. Final optimization and statistics..."
# Analyze tables for accurate statistics
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Update table statistics
ANALYZE big_documents;
ANALYZE huge_metrics;
ANALYZE mega_blobs;
-- Vacuum to optimize storage
VACUUM ANALYZE;
EOF
echo ""
echo "4. Final database metrics..."
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Database size breakdown
SELECT
'TOTAL DATABASE SIZE' as component,
pg_size_pretty(pg_database_size(current_database())) as size,
ROUND(pg_database_size(current_database()) / 1024.0 / 1024.0 / 1024.0, 2) || ' GB' as size_gb
UNION ALL
SELECT
'big_documents table',
pg_size_pretty(pg_total_relation_size('big_documents')),
ROUND(pg_total_relation_size('big_documents') / 1024.0 / 1024.0 / 1024.0, 2) || ' GB'
UNION ALL
SELECT
'huge_metrics table',
pg_size_pretty(pg_total_relation_size('huge_metrics')),
ROUND(pg_total_relation_size('huge_metrics') / 1024.0 / 1024.0 / 1024.0, 2) || ' GB'
UNION ALL
SELECT
'mega_blobs table',
pg_size_pretty(pg_total_relation_size('mega_blobs')),
ROUND(pg_total_relation_size('mega_blobs') / 1024.0 / 1024.0 / 1024.0, 2) || ' GB';
-- Row counts
SELECT
'TABLE ROWS' as metric,
'' as value,
'' as extra
UNION ALL
SELECT
'big_documents',
COUNT(*)::text,
'rows'
FROM big_documents
UNION ALL
SELECT
'huge_metrics',
COUNT(*)::text,
'rows'
FROM huge_metrics
UNION ALL
SELECT
'mega_blobs',
COUNT(*)::text,
'rows'
FROM mega_blobs;
EOF
FINAL_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "SELECT pg_size_pretty(pg_database_size('$DB_NAME'));" 2>/dev/null)
FINAL_GB=$(sudo -u postgres psql -d $DB_NAME -tAc "SELECT ROUND(pg_database_size('$DB_NAME') / 1024.0 / 1024.0 / 1024.0, 2);" 2>/dev/null)
echo ""
echo "=================================================="
echo "✅ LARGE DATABASE CREATION COMPLETED!"
echo "=================================================="
echo "Database Name: $DB_NAME"
echo "Final Size: $FINAL_SIZE (${FINAL_GB}GB)"
echo "Target: ${TARGET_SIZE_GB}GB"
echo "=================================================="
echo ""
echo "🧪 Ready for testing large database operations:"
echo ""
echo "# Test single database backup:"
echo "time sudo -u postgres ./dbbackup backup single $DB_NAME --confirm"
echo ""
echo "# Test cluster backup (includes this large DB):"
echo "time sudo -u postgres ./dbbackup backup cluster --confirm"
echo ""
echo "# Monitor backup progress:"
echo "watch 'ls -lah /backup/ 2>/dev/null || ls -lah ./*.dump* ./*.tar.gz 2>/dev/null'"
echo ""
echo "# Check database size anytime:"
echo "sudo -u postgres psql -d $DB_NAME -c \"SELECT pg_size_pretty(pg_database_size('$DB_NAME'));\""