If you are using Nagios, I've created a small check script (check_storagereplication) that alerts on replication errors. It's not perfect, but it works fine for us in production.
Code:
#!/bin/bash
# check_storagereplication - Nagios check for Proxmox storage replication.
#
# Runs `pvesr status` once and reports the worst condition found across all
# listed replication jobs (one bad job is never hidden by a later good one).
#
# Exit codes:
#   0 = OK       (every job is fine, or no replication jobs are listed)
#   1 = WARNING  (a job has a fail count between 1 and 10)
#   2 = CRITICAL (a job has a fail count above 10, a fail count that is not
#                 a number, or a state other than OK / SYNCING)
#   3 = UNKNOWN  (pvesr could not be run or exited with an error)

PVESR=/usr/bin/pvesr

# Take a single snapshot so fail counts and states describe the same moment.
# A failing pvesr must not be mistaken for "no replication jobs configured".
if ! status_output=$("$PVESR" status); then
  echo "UNKNOWN: Unable to run $PVESR status"
  exit 3
fi

severity=0    # worst status seen so far: 0 = ok, 1 = warning, 2 = critical
job_count=0   # rows that carried a fail count and/or a state

# Raise the overall severity to $1; never lower it.
raise_severity() {
  if (( $1 > severity )); then
    severity=$1
  fi
}

{
  read -r _   # the first line is the column header; discard it
  while read -r -a fields; do
    # Whitespace-separated column 7 is read as the fail count and column 8
    # as the state (only the first word of a multi-word state is looked at).
    fail_count=${fields[6]-}
    state=${fields[7]-}

    # Blank or too-short rows carry nothing to evaluate.
    if [[ -z "$fail_count" && -z "$state" ]]; then
      continue
    fi
    job_count=$(( job_count + 1 ))

    if [[ -n "$fail_count" ]]; then
      if [[ "$fail_count" =~ ^[0-9]+$ ]]; then
        # 10# forces base 10 so a value such as "08" is not parsed as octal.
        if (( 10#$fail_count > 10 )); then
          raise_severity 2
        elif (( 10#$fail_count > 0 )); then
          raise_severity 1
        fi
      else
        # A fail count that is not a number cannot be trusted: be loud.
        raise_severity 2
      fi
    fi

    if [[ -n "$state" && "$state" != "OK" && "$state" != "SYNCING" ]]; then
      raise_severity 2
    fi
  done
} <<<"$status_output"

if (( severity == 2 )); then
  echo "CRITICAL: Some replication jobs failed !"
  exit 2
elif (( severity == 1 )); then
  echo "WARNING: There is some errors with some replication jobs"
  exit 1
elif (( job_count == 0 )); then
  echo "OK: No replication jobs configured"
  exit 0
else
  echo "OK: All replication jobs working as intented"
  exit 0
fi