Release v1.2.0: Fix streaming compression for large databases

2025-11-11 15:21:36 +00:00
parent ed5c355385
commit 8005cfe943
9 changed files with 2011 additions and 15 deletions

create_large_test_db.sh Executable file

@@ -0,0 +1,243 @@
#!/bin/bash
# Large Test Database Creator - 50GB with Blobs
# Creates a substantial database for testing backup/restore performance on large datasets
set -e
DB_NAME="testdb_large_50gb"
TARGET_SIZE_GB=50
CHUNK_SIZE_MB=10 # Size of each blob chunk in MB
TOTAL_CHUNKS=$((TARGET_SIZE_GB * 1024 / CHUNK_SIZE_MB)) # Total number of chunks needed
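# With the defaults above: 50GB * 1024 / 10MB = 5120 chunks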
echo "=================================================="
echo "Creating Large Test Database: $DB_NAME"
echo "Target Size: ${TARGET_SIZE_GB}GB"
echo "Chunk Size: ${CHUNK_SIZE_MB}MB"
echo "Total Chunks: $TOTAL_CHUNKS"
echo "=================================================="
# Check available space
# df -Pk guarantees 1K blocks and one output line per filesystem, so $4/1024/1024 is GB
AVAILABLE_GB=$(df -Pk / | tail -1 | awk '{print int($4/1024/1024)}')
echo "Available disk space: ${AVAILABLE_GB}GB"
if [ "$AVAILABLE_GB" -lt $((TARGET_SIZE_GB + 10)) ]; then
echo "❌ ERROR: Insufficient disk space. Need at least $((TARGET_SIZE_GB + 10))GB"
exit 1
fi
echo "✅ Sufficient disk space available"
# Database connection settings (defined for reference; the commands below run
# psql as the postgres OS user via sudo, so these values are not actually used)
PGUSER="postgres"
PGHOST="localhost"
PGPORT="5432"
echo ""
echo "1. Creating database and schema..."
# Drop and recreate database
sudo -u postgres psql -c "DROP DATABASE IF EXISTS $DB_NAME;" 2>/dev/null || true
sudo -u postgres psql -c "CREATE DATABASE $DB_NAME;"
# Create tables with different data types
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Table for large binary objects (blobs)
CREATE TABLE large_blobs (
id SERIAL PRIMARY KEY,
name VARCHAR(255),
description TEXT,
blob_data BYTEA,
created_at TIMESTAMP DEFAULT NOW(),
size_mb INTEGER
);
-- Table for structured data with indexes
CREATE TABLE test_data (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL,
username VARCHAR(100) NOT NULL,
email VARCHAR(255) NOT NULL,
profile_data JSONB,
large_text TEXT,
random_number NUMERIC(15,2),
created_at TIMESTAMP DEFAULT NOW()
);
-- Table for time series data (lots of rows)
CREATE TABLE metrics (
id BIGSERIAL PRIMARY KEY,
timestamp TIMESTAMP NOT NULL,
metric_name VARCHAR(100) NOT NULL,
value DOUBLE PRECISION NOT NULL,
tags JSONB,
metadata TEXT
);
-- Indexes for performance
CREATE INDEX idx_test_data_user_id ON test_data(user_id);
CREATE INDEX idx_test_data_email ON test_data(email);
CREATE INDEX idx_test_data_created ON test_data(created_at);
CREATE INDEX idx_metrics_timestamp ON metrics(timestamp);
CREATE INDEX idx_metrics_name ON metrics(metric_name);
CREATE INDEX idx_metrics_tags ON metrics USING GIN(tags);
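-- Note: GIN on jsonb lets containment queries such as tags @> '{"env": "prod"}' use the index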
-- Large text table for document storage
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
title VARCHAR(500),
content TEXT,
document_data BYTEA,
tags TEXT[],
created_at TIMESTAMP DEFAULT NOW()
);
CREATE INDEX idx_documents_tags ON documents USING GIN(tags);
EOF
echo "✅ Database schema created"
echo ""
echo "2. Generating large blob data..."
# Function to generate random data
generate_blob_data() {
    local chunk_num=$1   # currently unused; kept so callers can tag chunks later
    local size_mb=$2
    # Emit size_mb MB of random bytes, base64-encoded on a single line (-w 0)
    dd if=/dev/urandom bs=1M count="$size_mb" 2>/dev/null | base64 -w 0
}
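# Note: base64 expands the payload by ~4/3, so each 10MB chunk travels as roughly
# 13MB of SQL text; decode() stores the original 10MB of random (and therefore
# effectively incompressible) bytes, which is what makes this a realistic stress
# test for backup compression.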
echo "Inserting $TOTAL_CHUNKS blob chunks of ${CHUNK_SIZE_MB}MB each..."
# Insert blob data in chunks
for i in $(seq 1 $TOTAL_CHUNKS); do
echo -n " Progress: $i/$TOTAL_CHUNKS ($(($i * 100 / $TOTAL_CHUNKS))%) - "
# Generate blob data
BLOB_DATA=$(generate_blob_data $i $CHUNK_SIZE_MB)
# Insert into the database. The SQL is piped over stdin because the base64
# payload (~13MB per 10MB chunk) is far too large to pass as a single
# psql -c argument (Linux limits any one argument to about 128KB).
sudo -u postgres psql -d "$DB_NAME" > /dev/null <<SQL
INSERT INTO large_blobs (name, description, blob_data, size_mb)
VALUES (
    'blob_chunk_$i',
    'Large binary data chunk $i of $TOTAL_CHUNKS for testing backup/restore performance',
    decode('$BLOB_DATA', 'base64'),
    $CHUNK_SIZE_MB
);
SQL
echo "✅ Chunk $i inserted"
# Every 10 chunks, show current database size
if [ $((i % 10)) -eq 0 ]; then
CURRENT_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "
SELECT pg_size_pretty(pg_database_size('$DB_NAME'));" 2>/dev/null || echo "Unknown")
echo " Current database size: $CURRENT_SIZE"
fi
done
echo ""
echo "3. Generating structured test data..."
# Insert large amounts of structured data
sudo -u postgres psql -d $DB_NAME << 'EOF'
-- Insert 1 million rows of test data (will add significant size)
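-- (generate_series in the expressions below refers to the output column of the
--  set-returning function in FROM, which is also named generate_series)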
INSERT INTO test_data (user_id, username, email, profile_data, large_text, random_number)
SELECT
generate_series % 100000 as user_id,
'user_' || generate_series as username,
'user_' || generate_series || '@example.com' as email,
('{"preferences": {"theme": "dark", "language": "en", "notifications": true}, "metadata": {"last_login": "2024-01-01", "session_count": ' || (generate_series % 1000) || ', "data": "' || repeat('x', 100) || '"}}')::jsonb as profile_data,
repeat('This is large text content for testing. ', 50) || ' Row: ' || generate_series as large_text,
random() * 1000000 as random_number
FROM generate_series(1, 1000000);
-- Insert time series data (2 million rows)
INSERT INTO metrics (timestamp, metric_name, value, tags, metadata)
SELECT
NOW() - (generate_series || ' minutes')::interval as timestamp,
CASE (generate_series % 5)
WHEN 0 THEN 'cpu_usage'
WHEN 1 THEN 'memory_usage'
WHEN 2 THEN 'disk_io'
WHEN 3 THEN 'network_tx'
ELSE 'network_rx'
END as metric_name,
random() * 100 as value,
('{"host": "server_' || (generate_series % 100) || '", "env": "' ||
CASE (generate_series % 3) WHEN 0 THEN 'prod' WHEN 1 THEN 'staging' ELSE 'dev' END ||
'", "region": "us-' || CASE (generate_series % 2) WHEN 0 THEN 'east' ELSE 'west' END || '"}')::jsonb as tags,
'Generated metric data for testing - ' || repeat('metadata_', 10) as metadata
FROM generate_series(1, 2000000);
-- Insert document data with embedded binary content
INSERT INTO documents (title, content, document_data, tags)
SELECT
'Document ' || generate_series as title,
repeat('This is document content with lots of text to increase database size. ', 100) ||
' Document ID: ' || generate_series || '. ' ||
repeat('Additional content to make documents larger. ', 20) as content,
('Binary document data for doc ' || generate_series || ': ' || repeat('BINARY_DATA_', 1000))::bytea as document_data,
ARRAY['tag_' || (generate_series % 10), 'category_' || (generate_series % 5), 'type_document'] as tags
FROM generate_series(1, 100000);
EOF
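# Note: the heavily repeated text above compresses well inside PostgreSQL (TOAST),
# so this step adds far less on-disk size than the raw character counts suggest;
# most of the ~50GB target comes from the incompressible random blobs in step 2.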
echo "✅ Structured data inserted"
echo ""
echo "4. Final database statistics..."
# Get final database size and statistics
sudo -u postgres psql -d $DB_NAME << 'EOF'
SELECT
'Database Size' as metric,
pg_size_pretty(pg_database_size(current_database())) as value
UNION ALL
SELECT
'Table: large_blobs',
pg_size_pretty(pg_total_relation_size('large_blobs'))
UNION ALL
SELECT
'Table: test_data',
pg_size_pretty(pg_total_relation_size('test_data'))
UNION ALL
SELECT
'Table: metrics',
pg_size_pretty(pg_total_relation_size('metrics'))
UNION ALL
SELECT
'Table: documents',
pg_size_pretty(pg_total_relation_size('documents'));
-- Row counts
SELECT 'large_blobs rows' as table_name, COUNT(*) as row_count FROM large_blobs
UNION ALL
SELECT 'test_data rows', COUNT(*) FROM test_data
UNION ALL
SELECT 'metrics rows', COUNT(*) FROM metrics
UNION ALL
SELECT 'documents rows', COUNT(*) FROM documents;
EOF
echo ""
echo "=================================================="
echo "✅ Large test database creation completed!"
echo "Database: $DB_NAME"
echo "=================================================="
# Show final size
FINAL_SIZE=$(sudo -u postgres psql -d $DB_NAME -tAc "SELECT pg_size_pretty(pg_database_size('$DB_NAME'));" 2>/dev/null)
echo "Final database size: $FINAL_SIZE"
echo ""
echo "You can now test backup/restore operations:"
echo " # Backup the large database"
echo " sudo -u postgres ./dbbackup backup single $DB_NAME"
echo ""
echo " # Backup entire cluster (including this large DB)"
echo " sudo -u postgres ./dbbackup backup cluster"
echo ""
echo " # Check database size anytime:"
echo " sudo -u postgres psql -d $DB_NAME -c \"SELECT pg_size_pretty(pg_database_size('$DB_NAME'));\""