#!/bin/bash
#############################################################################
# Script: couchbase_intelligent_monitor.sh
# Purpose: Intelligent Couchbase monitoring with auto-healing and recommendations
# Version: 1.0
# Author: Raj
# Date: June 10th 2014
#
# Features:
# - Comprehensive health monitoring
# - Auto-healing capabilities
# - Intelligent recommendations
# - Performance metrics collection
# - Predictive analysis
# - Alert management
#############################################################################
# Configuration
COUCHBASE_HOST="${COUCHBASE_HOST:-localhost}"
COUCHBASE_PORT="${COUCHBASE_PORT:-8091}"
COUCHBASE_USER="${COUCHBASE_USER:-Administrator}"
COUCHBASE_PASSWORD="${COUCHBASE_PASSWORD:-password}"
CLUSTER_NAME="${CLUSTER_NAME:-couchbase-cluster}"
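# NOTE: supply real credentials via environment variables; the defaults above
# are placeholders and should never be used in production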
# Monitoring Configuration
MONITOR_INTERVAL="${MONITOR_INTERVAL:-60}" # seconds
LOG_DIR="/var/log/couchbase-monitor"
LOG_FILE="${LOG_DIR}/monitor_$(date +%Y%m%d).log"
ALERT_LOG="${LOG_DIR}/alerts_$(date +%Y%m%d).log"
METRICS_FILE="${LOG_DIR}/metrics_$(date +%Y%m%d_%H%M%S).json" # reserved; not yet written by the checks below
REPORT_FILE="${LOG_DIR}/health_report_$(date +%Y%m%d_%H%M%S).html"
# Thresholds
# (CPU_THRESHOLD, SWAP_THRESHOLD, CONNECTION_THRESHOLD and QUERY_LATENCY_THRESHOLD
# are declared for future checks and are not yet referenced below)
CPU_THRESHOLD=80
MEMORY_THRESHOLD=85
DISK_THRESHOLD=80
SWAP_THRESHOLD=20
CACHE_MISS_THRESHOLD=10
REBALANCE_TIMEOUT=3600
CONNECTION_THRESHOLD=1000
QUERY_LATENCY_THRESHOLD=1000 # ms
INDEX_FRAGMENTATION_THRESHOLD=30
# Alert Configuration
ENABLE_EMAIL_ALERTS="${ENABLE_EMAIL_ALERTS:-false}"
ALERT_EMAIL="${ALERT_EMAIL:-admin@company.com}"
ENABLE_SLACK_ALERTS="${ENABLE_SLACK_ALERTS:-false}"
SLACK_WEBHOOK="${SLACK_WEBHOOK:-}"
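# Example invocation with Slack alerting enabled (webhook URL illustrative):
#   ENABLE_SLACK_ALERTS=true SLACK_WEBHOOK="https://hooks.slack.com/services/XXX" \
#       ./couchbase_intelligent_monitor.sh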
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m'
# Create log directory
mkdir -p "$LOG_DIR"
# Logging function
log_message() {
    local level=$1
    shift
    local message="$*" # "$*" joins the remaining args into one string
    local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
echo "[$timestamp] [$level] $message" | tee -a "$LOG_FILE"
if [ "$level" = "ERROR" ] || [ "$level" = "CRITICAL" ]; then
echo "[$timestamp] [$level] $message" >> "$ALERT_LOG"
send_alert "$level" "$message"
fi
}
# Print colored output
print_color() {
local color=$1
shift
echo -e "${color}$@${NC}"
}
# Send alerts
send_alert() {
local level=$1
local message=$2
# Email alert
if [ "$ENABLE_EMAIL_ALERTS" = "true" ] && [ -n "$ALERT_EMAIL" ]; then
echo "$message" | mail -s "[$level] Couchbase Alert - $CLUSTER_NAME" "$ALERT_EMAIL"
fi
    # Slack alert (silence curl output; the webhook reply is not useful here)
    if [ "$ENABLE_SLACK_ALERTS" = "true" ] && [ -n "$SLACK_WEBHOOK" ]; then
        curl -s -o /dev/null -X POST -H 'Content-type: application/json' \
            --data "{\"text\":\"[$level] $CLUSTER_NAME: $message\"}" \
            "$SLACK_WEBHOOK" 2>/dev/null
    fi
}
# Execute Couchbase REST API call
cb_api_call() {
local endpoint=$1
local method=${2:-GET}
local data=$3
local url="http://${COUCHBASE_HOST}:${COUCHBASE_PORT}${endpoint}"
if [ "$method" = "GET" ]; then
curl -s -u "${COUCHBASE_USER}:${COUCHBASE_PASSWORD}" "$url"
elif [ "$method" = "POST" ]; then
curl -s -X POST -u "${COUCHBASE_USER}:${COUCHBASE_PASSWORD}" \
-d "$data" "$url"
fi
}
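# Example:
#   balanced=$(cb_api_call "/pools/default" | jq -r '.balanced')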
# Execute Couchbase CLI command
cb_cli() {
    # Pass arguments straight through to avoid word-splitting bugs
    /opt/couchbase/bin/couchbase-cli "$@" \
        -c "${COUCHBASE_HOST}:${COUCHBASE_PORT}" \
        -u "$COUCHBASE_USER" \
        -p "$COUCHBASE_PASSWORD"
}
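# Example:
#   cb_cli server-list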
# Check cluster health
check_cluster_health() {
log_message "INFO" "Checking cluster health..."
local cluster_info=$(cb_api_call "/pools/default")
if [ -z "$cluster_info" ]; then
log_message "CRITICAL" "Cannot connect to Couchbase cluster"
return 1
fi
# Parse cluster status
local balanced=$(echo "$cluster_info" | jq -r '.balanced')
local rebalance_status=$(echo "$cluster_info" | jq -r '.rebalanceStatus')
local nodes_count=$(echo "$cluster_info" | jq '.nodes | length')
local healthy_nodes=$(echo "$cluster_info" | jq '[.nodes[] | select(.status == "healthy")] | length')
# Check if cluster is balanced
if [ "$balanced" != "true" ]; then
log_message "WARNING" "Cluster is not balanced"
recommend_action "REBALANCE" "Run cluster rebalance to distribute data evenly"
fi
# Check node health
if [ "$healthy_nodes" -lt "$nodes_count" ]; then
local unhealthy=$((nodes_count - healthy_nodes))
log_message "ERROR" "Found $unhealthy unhealthy nodes in cluster"
analyze_unhealthy_nodes
fi
# Check rebalance status
if [ "$rebalance_status" = "running" ]; then
log_message "INFO" "Rebalance in progress"
monitor_rebalance
fi
echo "$cluster_info"
}
# Analyze unhealthy nodes
analyze_unhealthy_nodes() {
    cb_api_call "/pools/default" | \
        jq -r '.nodes[] | select(.status != "healthy") | .hostname' | while read -r node; do
        log_message "WARNING" "Unhealthy node detected: $node"
        # .hostname is reported as "host:port"; strip the port before pinging
        if ! ping -c 1 -W 2 "${node%%:*}" > /dev/null 2>&1; then
            log_message "ERROR" "Node $node is unreachable"
            recommend_action "NODE_DOWN" "Check network connectivity or node status for $node"
        else
            # Try to diagnose the issue
            diagnose_node_issue "$node"
        fi
    done
}
# Diagnose node issues
diagnose_node_issue() {
local node=$1
log_message "INFO" "Diagnosing issues on node $node"
# Check services
local services=$(cb_api_call "/pools/default" | jq -r ".nodes[] | select(.hostname == \"$node\") | .services[]")
for service in $services; do
case $service in
"kv")
check_data_service "$node"
;;
"index")
check_index_service "$node"
;;
"n1ql")
check_query_service "$node"
;;
"fts")
check_search_service "$node"
;;
esac
done
}
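# ---------------------------------------------------------------------------
# Minimal per-service checks for diagnose_node_issue. These are thin sketches:
# they only verify that the service port answers, leaving deeper diagnostics
# to the dedicated checks elsewhere in this script. Ports are the Couchbase
# defaults (11210 data, 9102 index, 8094 search) and may differ on your
# deployment.
# ---------------------------------------------------------------------------
check_data_service() {
    local host=${1%%:*}
    if ! nc -z "$host" 11210 2>/dev/null; then
        log_message "ERROR" "Data service (kv) not responding on $host:11210"
        recommend_action "SERVICE" "Check the data service logs on $host"
    fi
}
check_index_service() {
    local host=${1%%:*}
    if ! nc -z "$host" 9102 2>/dev/null; then
        log_message "ERROR" "Index service not responding on $host:9102"
        recommend_action "SERVICE" "Check the indexer logs on $host"
    fi
}
check_search_service() {
    local host=${1%%:*}
    if ! nc -z "$host" 8094 2>/dev/null; then
        log_message "ERROR" "Search service (fts) not responding on $host:8094"
        recommend_action "SERVICE" "Check the fts logs on $host"
    fi
}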
# Check memory usage
check_memory_usage() {
    log_message "INFO" "Checking memory usage..."
    # Per-node system stats are exposed under /pools/default; jq -c emits one
    # JSON object per line so the read loop sees whole records
    cb_api_call "/pools/default" | jq -c '.nodes[]' | while read -r node_data; do
        local hostname=$(echo "$node_data" | jq -r '.hostname')
        local mem_used=$(echo "$node_data" | jq -r '.systemStats.mem_actual_used // 0')
        local mem_total=$(echo "$node_data" | jq -r '.systemStats.mem_total // 0')
        [ "$mem_total" = "0" ] && continue
        local mem_percent=$(echo "scale=2; ($mem_used / $mem_total) * 100" | bc)
        if (( $(echo "$mem_percent > $MEMORY_THRESHOLD" | bc -l) )); then
            log_message "WARNING" "High memory usage on $hostname: ${mem_percent}%"
            auto_heal_memory "$hostname"
        fi
    done
}
# Auto-heal memory issues
auto_heal_memory() {
local node=$1
log_message "INFO" "Attempting to auto-heal memory issues on $node"
# 1. Compact buckets
local buckets=$(cb_api_call "/pools/default/buckets" | jq -r '.[].name')
for bucket in $buckets; do
log_message "INFO" "Compacting bucket $bucket"
cb_api_call "/pools/default/buckets/$bucket/controller/compactBucket" "POST"
done
    # 2. Clear expired documents (compaction purges expired and deleted items;
    # couchbase-cli bucket-compact takes one bucket at a time)
    log_message "INFO" "Clearing expired documents"
    for bucket in $buckets; do
        cb_cli bucket-compact --bucket "$bucket"
    done
# 3. Adjust cache if needed
recommend_action "MEMORY" "Consider adjusting bucket memory quotas or adding more nodes"
}
# Check disk usage
check_disk_usage() {
log_message "INFO" "Checking disk usage..."
    local nodes=$(cb_api_call "/pools/default")
    # jq -c emits one JSON object per line so the read loop gets whole records
    echo "$nodes" | jq -c '.nodes[]' | while read -r node_data; do
local hostname=$(echo "$node_data" | jq -r '.hostname')
local disk_used=$(echo "$node_data" | jq -r '.systemStats.disk_used')
local disk_total=$(echo "$node_data" | jq -r '.systemStats.disk_total')
if [ "$disk_total" != "null" ] && [ "$disk_total" -gt 0 ]; then
local disk_percent=$(echo "scale=2; ($disk_used / $disk_total) * 100" | bc)
if (( $(echo "$disk_percent > $DISK_THRESHOLD" | bc -l) )); then
log_message "WARNING" "High disk usage on $hostname: ${disk_percent}%"
auto_heal_disk "$hostname"
fi
fi
done
}
# Auto-heal disk issues
auto_heal_disk() {
local node=$1
log_message "INFO" "Attempting to auto-heal disk issues on $node"
    # 1. Trigger compaction. NOTE: depending on the server version,
    # /controller/setAutoCompaction may require additional parameters (for
    # example parallelDBAndViewCompaction); verify against your release
    log_message "INFO" "Triggering auto-compaction"
    cb_api_call "/controller/setAutoCompaction" "POST" \
        "databaseFragmentationThreshold[percentage]=20&viewFragmentationThreshold[percentage]=20"
# 2. Clean up old logs
log_message "INFO" "Cleaning up old logs"
find /opt/couchbase/var/lib/couchbase/logs -type f -mtime +7 -delete 2>/dev/null
# 3. Recommend further actions
recommend_action "DISK" "Consider: 1) Increasing disk space, 2) Adjusting TTL values, 3) Archiving old data"
}
# Check bucket performance
check_bucket_performance() {
log_message "INFO" "Checking bucket performance..."
local buckets=$(cb_api_call "/pools/default/buckets")
echo "$buckets" | jq -r '.[].name' | while read bucket; do
local stats=$(cb_api_call "/pools/default/buckets/$bucket/stats")
# Check cache miss ratio
local cache_miss_rate=$(echo "$stats" | jq -r '.op.samples.ep_cache_miss_rate[-1]')
if (( $(echo "$cache_miss_rate > $CACHE_MISS_THRESHOLD" | bc -l) )); then
log_message "WARNING" "High cache miss rate in bucket $bucket: ${cache_miss_rate}%"
optimize_bucket_cache "$bucket"
fi
# Check disk queue
local disk_queue=$(echo "$stats" | jq -r '.op.samples.ep_queue_size[-1]')
if [ "$disk_queue" -gt 1000000 ]; then
log_message "WARNING" "Large disk queue in bucket $bucket: $disk_queue items"
recommend_action "PERFORMANCE" "Bucket $bucket has disk write backlog. Consider increasing writers or I/O capacity"
fi
# Check operation latency
check_operation_latency "$bucket" "$stats"
done
}
# Check operation latency
check_operation_latency() {
local bucket=$1
local stats=$2
    # Get operation timings (these samples are reported in microseconds)
    local get_latency=$(echo "$stats" | jq -r '.op.samples.get_cmd_latency[-1]' 2>/dev/null)
    local set_latency=$(echo "$stats" | jq -r '.op.samples.set_cmd_latency[-1]' 2>/dev/null)
    if [ -n "$get_latency" ] && [ "$get_latency" != "null" ]; then
        if (( $(echo "$get_latency > 1000" | bc -l) )); then # 1000 microseconds = 1 ms
            log_message "WARNING" "High GET latency in bucket $bucket: ${get_latency}μs"
            recommend_action "LATENCY" "Consider: 1) Adding more nodes, 2) Optimizing queries, 3) Adding indexes"
        fi
    fi
    if [ -n "$set_latency" ] && [ "$set_latency" != "null" ]; then
        if (( $(echo "$set_latency > 1000" | bc -l) )); then
            log_message "WARNING" "High SET latency in bucket $bucket: ${set_latency}μs"
            recommend_action "LATENCY" "Consider: 1) Checking disk I/O, 2) Reviewing durability settings, 3) Adding nodes"
        fi
    fi
}
# Optimize bucket cache
optimize_bucket_cache() {
local bucket=$1
log_message "INFO" "Optimizing cache for bucket $bucket"
    # Get current bucket configuration (.quota.ram is reported in bytes)
    local bucket_info=$(cb_api_call "/pools/default/buckets/$bucket")
    local ram_quota_mb=$(( $(echo "$bucket_info" | jq -r '.quota.ram // 0') / 1048576 ))
    local item_count=$(echo "$bucket_info" | jq -r '.basicStats.itemCount // 0')
    # Calculate optimal memory
    local bytes_per_item=500 # Approximate per-item working-set footprint
    local optimal_ram=$((item_count * bytes_per_item / 1048576)) # Convert to MB
    if [ "$optimal_ram" -gt "$ram_quota_mb" ]; then
        log_message "INFO" "Bucket $bucket needs more RAM. Current: ${ram_quota_mb}MB, Recommended: ${optimal_ram}MB"
        recommend_action "CACHE" "Increase RAM quota for bucket $bucket to ${optimal_ram}MB"
    fi
}
# Check indexes
check_indexes() {
log_message "INFO" "Checking indexes..."
# Get index status
local indexes=$(cb_api_call "/indexStatus")
echo "$indexes" | jq -r '.indexes[]' | while read -r index_data; do
local index_name=$(echo "$index_data" | jq -r '.index')
local status=$(echo "$index_data" | jq -r '.status')
local progress=$(echo "$index_data" | jq -r '.progress')
if [ "$status" != "Ready" ]; then
log_message "WARNING" "Index $index_name is not ready: $status ($progress%)"
if [ "$status" = "Error" ]; then
rebuild_index "$index_name"
fi
fi
# Check fragmentation
check_index_fragmentation "$index_name"
done
}
# Check index fragmentation
check_index_fragmentation() {
local index=$1
    local stats=$(cb_api_call "/pools/default/buckets/@index/stats")
    # NOTE: the fragmentation stat key differs across server versions;
    # "index_fragmentation_<name>" is an assumption - verify the keys your
    # release exposes under .op.samples
    local fragmentation=$(echo "$stats" | jq -r ".op.samples.index_fragmentation_${index}[-1]" 2>/dev/null)
if [ -n "$fragmentation" ] && [ "$fragmentation" != "null" ]; then
if (( $(echo "$fragmentation > $INDEX_FRAGMENTATION_THRESHOLD" | bc -l) )); then
log_message "WARNING" "High fragmentation in index $index: ${fragmentation}%"
rebuild_index "$index"
fi
fi
}
# Rebuild index
rebuild_index() {
local index=$1
log_message "INFO" "Rebuilding index $index"
    # This would typically use N1QL. NOTE: the statement below and the 8091
    # query endpoint are assumptions; N1QL normally listens on port 8093 and
    # the rebuild syntax varies by server version
    local query="ALTER INDEX \`$index\` REBUILD"
    cb_api_call "/query/service" "POST" "statement=$query"
recommend_action "INDEX" "Index $index is being rebuilt due to issues"
}
# Check query service
check_query_service() {
local node=$1
log_message "INFO" "Checking query service on $node"
    # Check active queries. NOTE: /pools/default/tasks returns a bare JSON
    # array, and not every version lists N1QL requests there;
    # system:active_requests is the more reliable source if this is empty
    local active_requests=$(cb_api_call "/pools/default/tasks" | \
        jq -c '.[] | select(.type == "n1ql")' 2>/dev/null)
    if [ -n "$active_requests" ]; then
        echo "$active_requests" | while read -r request; do
            local duration=$(echo "$request" | jq -r '.runtime')
            # Compare only when runtime is a plain integer (assumed ms)
            if [[ "$duration" =~ ^[0-9]+$ ]] && [ "$duration" -gt 60000 ]; then # more than 60 seconds
log_message "WARNING" "Long-running query detected: ${duration}ms"
analyze_slow_query "$request"
fi
done
fi
}
# Analyze slow queries
analyze_slow_query() {
local query_info=$1
log_message "INFO" "Analyzing slow query"
# Get query plan
local statement=$(echo "$query_info" | jq -r '.statement')
# Check for missing indexes
if echo "$statement" | grep -qi "WHERE\|JOIN" && ! echo "$statement" | grep -qi "USE INDEX"; then
recommend_action "QUERY" "Query may benefit from indexes. Review execution plan."
fi
# Check for full collection scans
if echo "$statement" | grep -qi "SELECT \*"; then
recommend_action "QUERY" "Avoid SELECT * queries. Specify required fields only."
fi
}
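# Example of the kind of secondary index the QUERY recommendation points at
# (bucket and field names are illustrative only):
#   CREATE INDEX idx_orders_status ON `orders`(status, created_at);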
# Check XDCR (Cross Data Center Replication)
check_xdcr() {
log_message "INFO" "Checking XDCR status..."
    local xdcr_tasks=$(cb_api_call "/pools/default/remoteClusters")
    local remote_count=$(echo "$xdcr_tasks" | jq '. | length' 2>/dev/null)
    if [ "${remote_count:-0}" -gt 0 ]; then
        # jq -c emits one JSON object per line so read gets whole records
        echo "$xdcr_tasks" | jq -c '.[]' | while read -r remote; do
            local name=$(echo "$remote" | jq -r '.name')
            local hostname=$(echo "$remote" | jq -r '.hostname')
            # Check connectivity (hostname may include ":port"; strip it)
            if ! nc -z "${hostname%%:*}" 8091 2>/dev/null; then
log_message "ERROR" "XDCR remote cluster $name unreachable at $hostname"
recommend_action "XDCR" "Check network connectivity to remote cluster $name"
fi
# Check replication status
check_xdcr_replication "$name"
done
fi
}
# Check XDCR replication status
check_xdcr_replication() {
    local remote=$1
    # The tasks endpoint returns a bare array; -c keeps one object per line
    local replications=$(cb_api_call "/pools/default/tasks" | \
        jq -c '.[] | select(.type == "xdcr")' 2>/dev/null)
    echo "$replications" | while read -r repl; do
        [ -z "$repl" ] && continue
        local status=$(echo "$repl" | jq -r '.status')
        # .errors is an array; count entries rather than comparing raw JSON
        local error_count=$(echo "$repl" | jq -r '.errors | length' 2>/dev/null)
        if [ "$status" = "error" ] || [ "${error_count:-0}" -gt 0 ]; then
            log_message "ERROR" "XDCR replication to $remote has errors"
            auto_heal_xdcr "$remote"
        fi
    done
}
# Auto-heal XDCR issues
auto_heal_xdcr() {
local remote=$1
log_message "INFO" "Attempting to auto-heal XDCR to $remote"
    # Restart XDCR replication by pausing and resuming it. NOTE:
    # --xdcr-replicator expects the replication ID (see
    # `couchbase-cli xdcr-replicate --list`), not the remote cluster name;
    # this assumes the caller passes a usable identifier
    cb_cli xdcr-replicate --pause --xdcr-replicator="$remote"
    sleep 5
    cb_cli xdcr-replicate --resume --xdcr-replicator="$remote"
recommend_action "XDCR" "XDCR replication to $remote was restarted. Monitor for improvements."
}
# Monitor rebalance
monitor_rebalance() {
local start_time=$(date +%s)
while true; do
local rebalance_status=$(cb_api_call "/pools/default/rebalanceProgress")
local status=$(echo "$rebalance_status" | jq -r '.status')
if [ "$status" = "none" ]; then
log_message "INFO" "Rebalance completed successfully"
break
fi
        # rebalanceProgress reports a 0-1 fraction per node, keyed by node
        # name; average across all nodes instead of hardcoding one
        local progress=$(echo "$rebalance_status" | \
            jq -r '[.[] | objects | .progress | numbers] | if length > 0 then (add / length) * 100 else 0 end')
        log_message "INFO" "Rebalance progress: ${progress}%"
# Check timeout
local current_time=$(date +%s)
local elapsed=$((current_time - start_time))
if [ "$elapsed" -gt "$REBALANCE_TIMEOUT" ]; then
log_message "ERROR" "Rebalance timeout after ${elapsed} seconds"
recommend_action "REBALANCE" "Rebalance is taking too long. Check cluster resources."
break
fi
sleep 30
done
}
# Check backup status
check_backup_status() {
log_message "INFO" "Checking backup status..."
    # Check for the backup repository (this path is the script's convention;
    # adjust it to your backup location)
    if [ -d "/opt/couchbase/backup" ]; then
        local latest_backup=$(find /opt/couchbase/backup -type d -name "20*" | sort -r | head -1)
        if [ -n "$latest_backup" ]; then
            # -mmin +1440: last modified more than 24 hours ago
            local backup_age=$(find "$latest_backup" -maxdepth 0 -mmin +1440 2>/dev/null)
            if [ -n "$backup_age" ]; then
                log_message "WARNING" "Latest backup is more than 24 hours old"
                recommend_action "BACKUP" "Schedule regular backups to prevent data loss"
            fi
        else
            log_message "WARNING" "No backups found"
            recommend_action "BACKUP" "Configure and schedule regular backups immediately"
        fi
    else
        log_message "WARNING" "Backup directory /opt/couchbase/backup does not exist"
        recommend_action "BACKUP" "Create a backup repository and schedule regular backups"
    fi
}
# Recommend actions based on issues
recommend_action() {
local category=$1
local recommendation=$2
echo "[$(date '+%Y-%m-%d %H:%M:%S')] [$category] RECOMMENDATION: $recommendation" >> "${LOG_DIR}/recommendations.log"
print_color "$YELLOW" "📌 RECOMMENDATION [$category]: $recommendation"
# Add to report
echo "<div class='recommendation $category'>$recommendation</div>" >> "$REPORT_FILE"
}
# Generate health report
generate_health_report() {
log_message "INFO" "Generating health report..."
cat > "$REPORT_FILE" <<EOF
<!DOCTYPE html>
<html>
<head>
<title>Couchbase Health Report - $(date '+%Y-%m-%d %H:%M:%S')</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
h1 { color: #333; }
.status-ok { color: green; }
.status-warning { color: orange; }
.status-error { color: red; }
.metric { margin: 10px 0; padding: 10px; border-left: 3px solid #ddd; }
.recommendation { background: #fffbdd; padding: 10px; margin: 10px 0; border-left: 3px solid #ffa500; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background: #f2f2f2; }
</style>
</head>
<body>
<h1>Couchbase Cluster Health Report</h1>
<p>Generated: $(date '+%Y-%m-%d %H:%M:%S')</p>
<p>Cluster: $CLUSTER_NAME</p>
EOF
# Add cluster overview
local cluster_info=$(cb_api_call "/pools/default")
local node_count=$(echo "$cluster_info" | jq '.nodes | length')
local balanced=$(echo "$cluster_info" | jq -r '.balanced')
cat >> "$REPORT_FILE" <<EOF
<h2>Cluster Overview</h2>
<table>
<tr><th>Metric</th><th>Value</th><th>Status</th></tr>
<tr><td>Total Nodes</td><td>$node_count</td><td class="status-ok">OK</td></tr>
<tr><td>Balanced</td><td>$balanced</td><td class="$([ "$balanced" = "true" ] && echo "status-ok" || echo "status-warning")">$([ "$balanced" = "true" ] && echo "OK" || echo "NEEDS REBALANCE")</td></tr>
</table>
EOF
# Add node details
cat >> "$REPORT_FILE" <<EOF
<h2>Node Status</h2>
<table>
<tr><th>Hostname</th><th>Status</th><th>Services</th><th>CPU %</th><th>Memory %</th><th>Disk %</th></tr>
EOF
echo "$cluster_info" | jq -r '.nodes[]' | while read -r node_data; do
local hostname=$(echo "$node_data" | jq -r '.hostname')
local status=$(echo "$node_data" | jq -r '.status')
local services=$(echo "$node_data" | jq -r '.services | join(", ")')
local cpu=$(echo "$node_data" | jq -r '.systemStats.cpu_utilization_rate // 0')
local mem_used=$(echo "$node_data" | jq -r '.systemStats.mem_actual_used // 0')
local mem_total=$(echo "$node_data" | jq -r '.systemStats.mem_total // 1')
local mem_percent=$(echo "scale=2; ($mem_used / $mem_total) * 100" | bc)
cat >> "$REPORT_FILE" <<EOF
<tr>
<td>$hostname</td>
<td class="$([ "$status" = "healthy" ] && echo "status-ok" || echo "status-error")">$status</td>
<td>$services</td>
<td>$cpu%</td>
<td>$mem_percent%</td>
<td>N/A</td>
</tr>
EOF
done
cat >> "$REPORT_FILE" <<EOF
</table>
<h2>Recommendations</h2>
$(cat "${LOG_DIR}/recommendations.log" 2>/dev/null | tail -20 | sed 's/^/<p>/;s/$/<\/p>/')
</body>
</html>
EOF
log_message "INFO" "Health report generated: $REPORT_FILE"
}
# Main monitoring loop
main_monitoring_loop() {
print_color "$GREEN" "=========================================="
print_color "$GREEN" "Couchbase Intelligent Monitor v1.0"
print_color "$GREEN" "Cluster: $CLUSTER_NAME"
print_color "$GREEN" "=========================================="
while true; do
print_color "$BLUE" "\n[$(date '+%Y-%m-%d %H:%M:%S')] Starting monitoring cycle..."
# Core health checks
check_cluster_health
check_memory_usage
check_disk_usage
check_bucket_performance
check_indexes
        check_query_service "$COUCHBASE_HOST"
check_xdcr
check_backup_status
# Generate report every hour
if [ $(($(date +%s) % 3600)) -lt "$MONITOR_INTERVAL" ]; then
generate_health_report
fi
print_color "$GREEN" "Monitoring cycle complete. Next check in ${MONITOR_INTERVAL} seconds..."
sleep "$MONITOR_INTERVAL"
done
}
# Signal handlers
trap 'log_message "INFO" "Monitor stopped by user"; exit 0' SIGINT SIGTERM
# Check prerequisites
check_prerequisites() {
# Check for required tools
    for tool in curl jq bc nc; do
        if ! command -v "$tool" &> /dev/null; then
print_color "$RED" "ERROR: Required tool '$tool' is not installed"
exit 1
fi
done
# Check Couchbase CLI
if [ ! -f "/opt/couchbase/bin/couchbase-cli" ]; then
print_color "$YELLOW" "WARNING: Couchbase CLI not found at default location"
fi
    # Test connection (curl --fail in cb_api_call turns HTTP/auth errors
    # into a non-zero exit status)
    if ! cb_api_call "/pools" > /dev/null 2>&1; then
print_color "$RED" "ERROR: Cannot connect to Couchbase at ${COUCHBASE_HOST}:${COUCHBASE_PORT}"
print_color "$YELLOW" "Please check connection settings and credentials"
exit 1
fi
}
# Start monitoring
log_message "INFO" "Starting Couchbase Intelligent Monitor"
check_prerequisites
main_monitoring_loop