Add a script for pruning old backup files

Adds a pruning script which is installed but not set to run by default.
Also adds tests for that script that can be run in a container that replicates the db container's conditions.
This commit is contained in:
bcj 2021-03-08 23:16:34 -06:00
parent 015d45ef99
commit 84b525f83e
7 changed files with 638 additions and 1 deletions

View file

@ -209,6 +209,11 @@ Whenever a user interacts with a book, they are interacting with a specific edit
Bookwyrm's db service dumps a backup copy of its database to its `/backups` directory daily at midnight UTC.
Backups are named `backup__%Y-%m-%d.sql`.
The db service has an optional script for periodically pruning the backups directory so that all recent daily backups are kept, but for older backups, only weekly or monthly backups are kept.
To enable this script:
- Uncomment the final line in `postgres-docker/cronfile`
- rebuild your instance `docker-compose up --build`
You can copy backups from the backups volume to your host machine with `docker cp`:
- Run `docker-compose ps` to confirm the db service's full name (it's probably `bookwyrm_db_1`).
- Run `docker cp <container_name>:/backups <host machine path>`

View file

@ -3,6 +3,7 @@ FROM postgres:latest
# crontab
RUN mkdir /backups
COPY ./backup.sh /backups
COPY ./weed.sh /backups
COPY ./cronfile /etc/cron.d/cronfile
RUN apt-get update && apt-get -y install cron
RUN chmod 0644 /etc/cron.d/cronfile

View file

@ -1,2 +1,5 @@
0 0 * * * /backups/backup.sh
# If uncommented, this script will weed the backups directory. It keeps the 14
# most recent daily backups, then 4 weekly backups (one per week), then one
# backup per month after that.
# 0 1 * * * /backups/weed.sh -d 14 -w 4 -m -1 /backups

View file

@ -0,0 +1,8 @@
# Image for running the weed.sh test suite. It starts from the same base image
# as the db service so the script is exercised with the same tools.
FROM postgres:latest
# shellcheck is used by the test suite to lint weed.sh and the tests themselves.
RUN apt update && apt install -y shellcheck
# The build context is the parent directory (see the compose file), hence the
# ./tests/ prefix here and the bare ./weed.sh below.
COPY ./tests/testing-entrypoint.sh /testing-entrypoint.sh
# Owner may read and execute; group and others may only read.
RUN chmod u+rx,go=r /testing-entrypoint.sh
COPY ./weed.sh /weed.sh
RUN chmod u+rx,go=r /weed.sh

View file

@ -0,0 +1,9 @@
version: "3"
services:
weeding:
build:
# We need to build from the parent directory so we can access weed.sh
context: ..
dockerfile: ./tests/Dockerfile
entrypoint: /testing-entrypoint.sh

View file

@ -0,0 +1,426 @@
#!/usr/bin/env bash
# These tests are written to run in their own container, using the same image as the
# actual postgres service. To run: `docker-compose up --build`
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail
# Pull in weed_directory and friends. weed.sh only runs its main function when
# it is executed directly, so sourcing it just defines the functions.
source /weed.sh
# Running totals for the summary printed by `tests`. ERROR_COUNT is bumped by
# perform_test when a run exits non-zero; FAILURE_COUNT is bumped by
# compare_files when the output does not match what was expected.
ERROR_COUNT=0
FAILURE_COUNT=0
# compare_files <expected> <actual>
# Compare two sorted files that each hold one path per line.
#
# When the files match, print nothing and return 0. Otherwise print 'fail'
# followed by the paths that are only in <expected> ("missing") and the paths
# that are only in <actual> ("extra"), bump the global FAILURE_COUNT and
# return 1. A diff that cannot run at all (exit status above 1) is reported as
# a failure too rather than being mistaken for a match.
function compare_files {
  local expected="$1"
  local actual="$2"
  local -a missing=()
  local -a extra=()
  local diff_output=""
  local diff_status=0
  local line
  local entry

  # diff exits 0 for identical files, 1 for differences and 2 on trouble.
  # Capture the status with `||` so `set -e` in the caller does not abort here.
  diff_output=$(diff --suppress-common-lines "$expected" "$actual") \
    || diff_status=$?

  if ((diff_status > 1)); then
    echo 'fail'
    printf '\tdiff could not compare %s and %s\n' "$expected" "$actual"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    return 1
  fi

  # In diff's default output, lines only in the first file start with "< " and
  # lines only in the second start with "> ". Read line by line (rather than
  # word-splitting the output) so paths are never glob-expanded, and drop the
  # two-character marker so the path is reported without a stray leading space.
  while IFS= read -r line; do
    case "$line" in
      '< '*) missing+=("${line:2}") ;;
      '> '*) extra+=("${line:2}") ;;
    esac
  done <<< "$diff_output"

  if ((${#missing[@]} + ${#extra[@]} == 0)); then
    return 0
  fi

  echo 'fail'
  if ((${#missing[@]} > 0)); then
    printf '\t%s missing files:\n' "${#missing[@]}"
    for entry in "${missing[@]}"; do
      printf '\t\t%s\n' "$entry"
    done
  fi
  if ((${#extra[@]} > 0)); then
    printf '\t%s extra files:\n' "${#extra[@]}"
    for entry in "${extra[@]}"; do
      printf '\t\t%s\n' "$entry"
    done
  fi
  FAILURE_COUNT=$((FAILURE_COUNT + 1))
  return 1
}
# perform_test <daily_threshold> <weekly_threshold> <monthly_threshold> [file ...]
# Wrapper that runs one weeding scenario from start to finish. It creates /testing,
# fills it with the named (empty) files, and then exercises the weeding code three
# ways, stopping at the first stage that errors or fails:
#   1. calling the weed_directory function directly,
#   2. running /weed.sh as a dry run (-l),
#   3. running /weed.sh for real and checking which files are left on disk.
# The first three arguments are the thresholds handed to weed_directory / weed.sh.
# The remaining arguments are file names, relative to /testing, to create for the
# test. Bash isn't great at passing arrays, so instead of separately passing in a
# list of expected results, flag the files you expect to be deleted by prepending
# "DELETE:" to the path.
# Prints the test label followed by 'pass', 'fail' (via compare_files) or 'error'.
# ERROR_COUNT is incremented here when a run exits non-zero; FAILURE_COUNT is
# incremented by compare_files. /testing is removed again before returning.
function perform_test {
# Label the output with the name of the calling test function: strip the "test_"
# prefix, turn underscores into spaces and the trailing newline into ':', then emit
# a tab so the verdict lands on the same line.
echo "${FUNCNAME[1]}" | sed 's/^test_\(.*\)$/\1/' | tr '_\n' ' :'
echo -en '\t'
local daily_threshold="$1"
shift
local weekly_threshold="$1"
shift
local monthly_threshold="$1"
shift
# We might as well name the bookkeeping files we use for running tests in as
# inflammatory a way as possible (each is a near miss for a real backup name) to
# increase the chances that bad filtering by weed_directory results in tests failing.
#   expected:  paths that should be listed for deletion
#   actual:    what the code under test actually produced
#   remaining: paths that should still exist after a real (non-dry) run
#   temp:      scratch file, used both for sorting and for capturing stderr
local expected="/testing/expected/backup__2020-02-02.sql"
local actual="/testing/backup__2020-02-02.sql.actual"
local remaining="/testing/remainbackup__2020-02-02.sql"
local temp="/testing/backup__2020-TE-MP.sql"
# create test files
mkdir -p /testing/expected
if [[ -e "$expected" ]]; then
rm "$expected"
fi
touch "$expected"
# The four bookkeeping files live under /testing too, so they must be listed among
# the files expected to remain when stage three lists everything left in /testing.
echo -e "$expected\\n$actual\\n$remaining\\n$temp" > "$remaining"
# Create every requested file, recording it as either "to be deleted" or "to remain".
while [[ "$#" -gt 0 ]]; do
if [[ "$1" =~ ^DELETE: ]]; then
# ${1:7} drops the 7-character "DELETE:" marker.
path="/testing/${1:7}"
echo "$path" >> "$expected"
else
path="/testing/$1"
echo "$path" >> "$remaining"
fi
# File names may include a subdirectory (e.g. "a/backup__...sql"); make sure it exists.
directory=$(dirname "$path")
mkdir -p "$directory"
touch "$path"
shift
done
# We don't make any promise about the order files will be listed in by
# weed_directory (it is currently reverse-chronological). We should sort the output
# and the expected file instead of forcing tests to list files in that order (or
# causing tests to fail if weed_directory's order changes)
sort "$expected" > "$temp"
mv "$temp" "$expected"
sort "$remaining" > "$temp"
mv "$temp" "$remaining"
# Part one: call the function directly
# errexit is switched off around each stage so that a non-zero status can be
# captured in $result and reported instead of killing the whole test run.
set +e
(
weed_directory \
"/testing" \
"$daily_threshold" \
"$weekly_threshold" \
"$monthly_threshold" \
2> "$temp" \
| sort > "$actual"
)
local result="$?"
set -e
if [[ "$result" -ne 0 ]]; then
# The run itself blew up: count it as an error and show whatever it wrote to stderr.
echo 'error'
ERROR_COUNT=$((ERROR_COUNT + 1))
if [[ -s "$temp" ]]; then
echo 'stderr:'
cat "$temp"
fi
else
# compare_files prints 'fail' plus the differences and bumps FAILURE_COUNT itself.
set +e
compare_files "$expected" "$actual"
result="$?"
set -e
if [[ "$result" -eq 0 ]]; then
# Part two: as a script with the dry-run flag (-l)
set +e
(
"/weed.sh" \
"-d" "$daily_threshold" \
"-w" "$weekly_threshold" \
"-m" "$monthly_threshold" \
"-l" \
"/testing" \
2> "$temp" \
| sort > "$actual"
)
local result="$?"
set -e
if [[ "$result" -ne 0 ]]; then
echo 'error'
ERROR_COUNT=$((ERROR_COUNT + 1))
if [[ -s "$temp" ]]; then
echo 'stderr:'
cat "$temp"
fi
else
set +e
compare_files "$expected" "$actual"
result="$?"
set -e
if [[ "$result" -eq 0 ]]; then
# Part three: let's try actually deleting files
set +e
(
"/weed.sh" \
"-d" "$daily_threshold" \
"-w" "$weekly_threshold" \
"-m" "$monthly_threshold" \
"/testing" \
2> "$temp"
)
local result="$?"
set -e
if [[ "$result" -ne 0 ]]; then
echo 'error'
ERROR_COUNT=$((ERROR_COUNT + 1))
if [[ -s "$temp" ]]; then
echo 'stderr:'
cat "$temp"
fi
else
# After a real run, everything still on disk (in any subdirectory) must be exactly
# the files that were not flagged for deletion plus the bookkeeping files.
find /testing -type f | sort > "$actual"
set +e
compare_files "$remaining" "$actual"
result="$?"
set -e
if [[ "$result" -eq 0 ]]; then
echo 'pass'
elif [[ -s "$temp" ]]; then
# The comparison failed (already reported by compare_files); add the run's stderr.
echo 'stderr:'
cat "$temp"
fi
fi
elif [[ -s "$temp" ]]; then
# Dry-run comparison failed; show the script's stderr to help explain why.
echo 'stderr:'
cat "$temp"
fi
fi
elif [[ -s "$temp" ]]; then
# Direct-call comparison failed; show the function's stderr to help explain why.
echo 'stderr:'
cat "$temp"
fi
fi
# Clean up so the next test starts from an empty slate.
rm -rf /testing
}
# actual tests
function test_shellcheck {
  # Lint the production script first, then this test file as well (writing
  # bash is hard, so the tests get checked too). Under `set -e` a lint failure
  # aborts before 'pass' is printed.
  printf 'running shellcheck on scripts:\t'
  local script="/weed.sh"
  local harness="/testing-entrypoint.sh"
  shellcheck "$script"
  shellcheck -x "$harness"
  printf '%s\n' 'pass'
}
function test_empty_directory {
  # No backup files at all: nothing should be listed or deleted.
  local -a thresholds=(1 2 3)
  perform_test "${thresholds[@]}"
}
function test_single_file {
  # A lone backup is always kept.
  local -a backups=("backup__2021-02-02.sql")
  perform_test 1 2 3 "${backups[@]}"
}
function test_keep_everything {
  # A daily threshold of -1 means daily backups are never thinned out.
  local -a backups=(
    "backup__2021-02-02.sql"
    "backup__2021-02-01.sql"
    "backup__2021-01-31.sql"
  )
  perform_test -1 0 0 "${backups[@]}"
}
function test_keep_one {
  # One daily backup is kept; with weekly and monthly skipped (0), the older
  # backups from the same year are weeded.
  local -a backups=(
    "backup__2021-02-02.sql"
    "DELETE:backup__2021-02-01.sql"
    "DELETE:backup__2021-01-31.sql"
  )
  perform_test 1 0 0 "${backups[@]}"
}
function test_weekly {
# weed.sh groups weekly backups using date's %W format (week number of the year,
# with Monday as the first day of the week), so a week runs Monday through Sunday.
# Files are visited newest first, and the newest backup seen in each week is kept.
# With daily set to 0 and weekly set to -1, every file is judged by its week.
# backup__2021-03-08.sql: Monday, newest file (keep)
# backup__2021-03-07.sql: Sunday, the week before (keep)
# backup__2021-02-28.sql: Sunday (keep)
# backup__2021-02-22.sql: Monday, same week as 02-28 (delete)
# backup__2021-02-20.sql: Saturday (keep)
# backup__2021-02-16.sql: Tuesday, same week as 02-20 (delete)
# backup__2021-02-15.sql: Monday, same week as 02-20 (delete)
# backup__2021-02-14.sql: Sunday (keep)
# backup__2020-02-14.sql: Friday, same week number as 2021-02-14 but a different
# year (keep)
perform_test 0 -1 0 \
"backup__2021-03-08.sql" \
"backup__2021-03-07.sql" \
"backup__2021-02-28.sql" \
"DELETE:backup__2021-02-22.sql" \
"backup__2021-02-20.sql" \
"DELETE:backup__2021-02-16.sql" \
"DELETE:backup__2021-02-15.sql" \
"backup__2021-02-14.sql" \
"backup__2020-02-14.sql"
}
function test_monthly {
  # One daily backup, no weekly stage (0), unlimited monthly (-1): after the
  # newest file, only the newest backup of each other month survives.
  local -a backups=(
    "backup__2021-03-08.sql"
    "DELETE:backup__2021-03-07.sql"
    "backup__2021-02-28.sql"
    "DELETE:backup__2021-02-22.sql"
    "DELETE:backup__2021-02-20.sql"
    "DELETE:backup__2021-02-16.sql"
    "DELETE:backup__2021-02-15.sql"
    "DELETE:backup__2021-02-14.sql"
    "backup__2021-01-14.sql"
    "backup__2020-01-13.sql"
  )
  perform_test 1 0 -1 "${backups[@]}"
}
function test_annual {
  # Every threshold is 0, so weeding drops straight to the annual stage: only
  # the newest backup of each calendar year is kept.
  local -a backups=(
    "backup__2021-03-08.sql"
    "DELETE:backup__2021-03-07.sql"
    "DELETE:backup__2021-02-28.sql"
    "DELETE:backup__2021-02-22.sql"
    "DELETE:backup__2021-02-20.sql"
    "DELETE:backup__2021-02-16.sql"
    "DELETE:backup__2021-02-15.sql"
    "DELETE:backup__2021-02-14.sql"
    "DELETE:backup__2021-01-14.sql"
    "backup__2020-01-13.sql"
    "backup__2019-12-31.sql"
    "DELETE:backup__2019-01-13.sql"
  )
  perform_test 0 0 0 "${backups[@]}"
}
# Will not pass while maxdepth is set to 1 (files in subdirectories are never
# found). The skip_ prefix keeps the runner, which only picks up functions
# named test_..., from executing it.
function skip_test_sort_order {
  # Backups alternate between two subdirectories so that lexical path order
  # and chronological order disagree.
  local -a backups=(
    "a/backup__2021-03-08.sql"
    "DELETE:b/backup__2021-03-07.sql"
    "DELETE:a/backup__2021-02-28.sql"
    "DELETE:b/backup__2021-02-22.sql"
    "DELETE:a/backup__2021-02-20.sql"
    "DELETE:b/backup__2021-02-16.sql"
    "DELETE:a/backup__2021-02-15.sql"
    "DELETE:b/backup__2021-02-14.sql"
    "DELETE:a/backup__2021-01-14.sql"
    "b/backup__2020-01-13.sql"
    "a/backup__2019-12-31.sql"
    "DELETE:b/backup__2019-01-13.sql"
  )
  perform_test 0 0 1 "${backups[@]}"
}
function test_ignore_subdirectories {
  # The newer backup sits in a subdirectory. If it were taken into account, the
  # older top-level backup would be weeded; neither file is expected to go.
  local in_subdirectory="a/backup__2021-03-08.sql"
  local top_level="backup__2021-03-07.sql"
  perform_test 0 0 0 "$in_subdirectory" "$top_level"
}
function test_standard {
  # A realistic run: 14 daily backups, then 4 weekly, then 1 monthly, over an
  # unbroken series of daily backups.
  local -a backups=(
    # The 14 most recent days are all kept.
    "backup__2021-03-08.sql"
    "backup__2021-03-07.sql"
    "backup__2021-03-06.sql"
    "backup__2021-03-05.sql"
    "backup__2021-03-04.sql"
    "backup__2021-03-03.sql"
    "backup__2021-03-02.sql"
    "backup__2021-03-01.sql"
    "backup__2021-02-28.sql"
    "backup__2021-02-27.sql"
    "backup__2021-02-26.sql"
    "backup__2021-02-25.sql"
    "backup__2021-02-24.sql"
    "backup__2021-02-23.sql"
    # Weekly stage: the newest backup of each of the next four weeks is kept.
    "DELETE:backup__2021-02-22.sql"
    "backup__2021-02-21.sql"
    "DELETE:backup__2021-02-20.sql"
    "DELETE:backup__2021-02-19.sql"
    "DELETE:backup__2021-02-18.sql"
    "DELETE:backup__2021-02-17.sql"
    "DELETE:backup__2021-02-16.sql"
    "DELETE:backup__2021-02-15.sql"
    "backup__2021-02-14.sql"
    "DELETE:backup__2021-02-13.sql"
    "DELETE:backup__2021-02-12.sql"
    "DELETE:backup__2021-02-11.sql"
    "DELETE:backup__2021-02-10.sql"
    "DELETE:backup__2021-02-09.sql"
    "DELETE:backup__2021-02-08.sql"
    "backup__2021-02-07.sql"
    "DELETE:backup__2021-02-06.sql"
    "DELETE:backup__2021-02-05.sql"
    "DELETE:backup__2021-02-04.sql"
    "DELETE:backup__2021-02-03.sql"
    "DELETE:backup__2021-02-02.sql"
    "DELETE:backup__2021-02-01.sql"
    "backup__2021-01-31.sql"
    # Monthly stage: the rest of January shares a month with the backup above.
    "DELETE:backup__2021-01-30.sql"
    "DELETE:backup__2021-01-29.sql"
    "DELETE:backup__2021-01-28.sql"
    "DELETE:backup__2021-01-27.sql"
    "DELETE:backup__2021-01-26.sql"
    "DELETE:backup__2021-01-25.sql"
    "DELETE:backup__2021-01-24.sql"
    "DELETE:backup__2021-01-23.sql"
    "DELETE:backup__2021-01-22.sql"
    "DELETE:backup__2021-01-21.sql"
    "DELETE:backup__2021-01-20.sql"
    "DELETE:backup__2021-01-19.sql"
    "DELETE:backup__2021-01-18.sql"
    "DELETE:backup__2021-01-17.sql"
    "DELETE:backup__2021-01-16.sql"
    "DELETE:backup__2021-01-15.sql"
    "DELETE:backup__2021-01-14.sql"
    "DELETE:backup__2021-01-13.sql"
    "DELETE:backup__2021-01-12.sql"
    "DELETE:backup__2021-01-11.sql"
    "DELETE:backup__2021-01-10.sql"
    "DELETE:backup__2021-01-09.sql"
    "DELETE:backup__2021-01-08.sql"
    "DELETE:backup__2021-01-07.sql"
    "DELETE:backup__2021-01-06.sql"
    "DELETE:backup__2021-01-05.sql"
    "DELETE:backup__2021-01-04.sql"
    "DELETE:backup__2021-01-03.sql"
    "DELETE:backup__2021-01-02.sql"
    "DELETE:backup__2021-01-01.sql"
    # First backup of an earlier month: kept.
    "backup__2020-12-31.sql"
  )
  perform_test 14 4 1 "${backups[@]}"
}
# tests
# Run every function in this file whose name starts with test_, in definition
# order, then print a summary of how many passed, errored and failed. The
# totals come from the global ERROR_COUNT and FAILURE_COUNT, which the test
# helpers update as they go.
function tests {
  local count=0
  local test_name
  local -a test_names=()

  # Discover the tests by scanning this file for `function test_... {` lines.
  # Collect the names before running anything so that the test functions do
  # not run with their stdin attached to the list of names.
  mapfile -t test_names < <(
    awk '$1 == "function" && $2 ~ "^test_" {print $2}' "${BASH_SOURCE[0]}"
  )

  for test_name in "${test_names[@]}"; do
    # The names are plain function names, so they can be called directly.
    "$test_name"
    count=$((count + 1))
  done

  echo "------------------"
  echo "$((count - ERROR_COUNT - FAILURE_COUNT))/$count tests passed"
  if [[ $((FAILURE_COUNT + ERROR_COUNT)) -gt 0 ]]; then
    if [[ "$ERROR_COUNT" -gt 0 ]]; then
      echo "$ERROR_COUNT tests errored"
    fi
    if [[ "$FAILURE_COUNT" -gt 0 ]]; then
      echo "$FAILURE_COUNT tests failed"
    fi
    echo 'failure'
  else
    echo 'success'
  fi
}
# Only run the suite when this file is executed directly, not when it is sourced.
if [[ "${BASH_SOURCE[0]}" -ef "$0" ]]; then
  # While the suite is running, any exit is unexpected (set -e tripped), so
  # report where it happened. The trap is cleared once the suite has finished.
  trap 'echo -e "\\terror (in ${FUNCNAME[1]} ${BASH_SOURCE[1]}:${BASH_LINENO[1]})\naborting"' EXIT
  tests
  trap - EXIT
  # Give the container a non-zero exit status if anything failed or errored.
  if ((FAILURE_COUNT + ERROR_COUNT > 0)); then
    exit 1
  fi
fi

185
postgres-docker/weed.sh Executable file
View file

@ -0,0 +1,185 @@
#!/usr/bin/env bash
# Weed old backups. See HELP for details.
# Tests for this script can be found in:
# bookwyrm/postgres-docker/tests/testing-entrypoint.sh
set -euo pipefail
# Default thresholds, used when the matching option (-d, -w, -m) is not given.
# -1 means "never drop to the next, lower frequency".
DAILY_THRESHOLD=14
WEEKLY_THRESHOLD=4
MONTHLY_THRESHOLD=-1
# Help text printed by `weed.sh -h`. The defaults above are interpolated into it.
HELP="\
NAME
weed -- remove old backups from the backups directory
SYNOPSIS
weed.sh [-h] [-d threshold] [-w threshold] [-m threshold] [-l] backup_directory
DESCRIPTION
Reduce the number of backups by only keeping a certain number of daily backups before \
reducing the frequency to weekly, monthly, and then finally annually.
For each threshold, setting it to 0 will skip that frequency (e.g., setting weekly to \
0 will mean backups go directly from daily to monthly), and setting it to -1 will \
never reduce backups to a lower frequency (e.g., setting weekly to -1 will mean \
backups never are reduced to monthly backups).
-d threshold: Store this many daily backups before switching to weekly \
(default $DAILY_THRESHOLD)
-w threshold: Store this many weekly backups before switching to monthly \
(default $WEEKLY_THRESHOLD)
-m threshold: Store this many monthly backups before switching to annual \
(default $MONTHLY_THRESHOLD)
-l: Dry run. List the files that would be deleted.
-h: Show this help text and exit.
"
# fail <message>
# Report <message> on stderr, prefixed with the program name, and terminate
# the current (sub)shell with status 1. Backslash escapes in the message are
# interpreted.
function fail {
  local message="weed: $1"
  echo -e "$message" 1>&2
  exit 1
}
# parse_threshold <hopefully-a-number>
# Validate a threshold and print it in canonical decimal form.
# A threshold must be a non-negative integer, or -1 for "no threshold".
# Anything else is rejected through fail, which exits with status 1.
# Zero-padded input such as "08" is read as decimal and printed as "8", so
# later numeric comparisons never see a value bash would treat as invalid octal.
function parse_threshold {
  local sign
  local digits
  local value

  if [[ ! $1 =~ ^(-?)([0-9]+)$ ]]; then
    fail "Invalid threshold: $1"
  fi
  sign=${BASH_REMATCH[1]}
  digits=${BASH_REMATCH[2]}
  # 10# forces base ten, so leading zeros are harmless.
  value=$((10#$digits))

  if [[ -n "$sign" ]]; then
    # The only negative value allowed is -1 ("-0" is simply 0).
    if [[ $value -gt 1 ]]; then
      fail "Invalid threshold: $1"
    fi
    value=$((-value))
  fi
  echo "$value"
}
# weed_directory <directory> <daily_threshold> <weekly_threshold> <monthly_threshold>
# List files to be deleted, one path per line, on stdout. Nothing is removed here.
#
# Backups are visited newest first. The newest backup is always kept. After
# that, backups are kept at daily frequency until <daily_threshold> of them have
# been kept, then at weekly, monthly and finally annual frequency. Within a
# frequency, a backup that falls in the same period (day, week, month or year)
# as the most recently kept backup is listed for deletion. A threshold of 0
# skips that frequency; -1 stays at that frequency forever.
#
# Paths are read line by line, so directories containing spaces are handled.
# Paths containing newlines are not supported.
function weed_directory {
  local directory=$1
  local daily_threshold=$2
  local weekly_threshold=$3
  local monthly_threshold=$4
  local count=0
  # The trailing -1 makes annual backups unlimited, which also guarantees that
  # index can never run past the end of these two parallel arrays.
  local thresholds=("$daily_threshold" "$weekly_threshold" "$monthly_threshold" -1)
  local date_formats=("%Y %m %d" "%Y %W" "%Y %m" "%Y")
  local index=0
  local last_date=""
  local last_format=""
  local date=""
  local date_file
  local file
  local format

  # We would like to loop through all the backup files in the backup directory in
  # reverse-chronological order. Bookwyrm backup files are named such that
  # chronological and lexical order match. So we should be safe to find all backup
  # files and reverse sort them. We should be terrified of deleting a backup an
  # instance maintainer wants to keep, so we will be extra cautious. We're ignoring
  # any subdirectories in case someone moves an important backup into a meaningfully
  # named folder. We are also prepending the date to the path before sorting so that
  # the ordering would be correct even if we were allowed to find backup files in
  # subdirectories where chronological and lexical order don't match.
  while IFS= read -r date_file; do
    # Each line is the 10-character date (YYYY-MM-DD) glued to the front of the path.
    date="${date_file:0:10}"
    file="${date_file:10}"

    # Move on to the next frequency once enough backups have been kept at this
    # one. We can't fall off the end because we set annual backups to unlimited.
    # It seems unlikely that instance maintainers would have enough concern about
    # the space one backup/year takes to warrant supporting a cutoff.
    while [[ ${thresholds[index]} -ne -1 && $count -ge ${thresholds[index]} ]]; do
      index=$((index + 1))
      last_format=""
      count=0
    done

    if [[ -z "$last_date" ]]; then
      # Very first (newest) backup: always kept.
      count=$((count + 1))
      last_date=$date
      last_format=""
    else
      # After a change of frequency, re-express the last kept backup's date in
      # the new period format before comparing against it.
      if [[ -z "$last_format" ]]; then
        last_format=$(date --date="$last_date" +"${date_formats[index]}")
      fi
      format=$(date --date="$date" +"${date_formats[index]}")
      if [[ "$format" == "$last_format" ]]; then
        # Same period as a backup we are already keeping: weed it.
        echo "$file"
      else
        count=$((count + 1))
        last_date="$date"
        last_format="$format"
      fi
    fi
  done < <(
    find "$directory" \
      -maxdepth 1 \
      -name 'backup__[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\.sql' \
      | sed 's/\(^.*backup__\([0-9-]*\)\.sql$\)/\2\1/' \
      | sort --reverse
  )
}
# main [-h] [-d threshold] [-w threshold] [-m threshold] [-l] backup_directory
# Command-line entry point. Parses the options, asks weed_directory which
# backups should go, and then either lists them on stdout (-l, dry run) or
# deletes them. Progress and the final tally are written to stderr.
# Exits with status 1 (via fail) on bad options, a bad threshold, a wrong
# number of arguments, or a file that cannot be deleted.
function main() {
  local daily_threshold=$DAILY_THRESHOLD
  local weekly_threshold=$WEEKLY_THRESHOLD
  local monthly_threshold=$MONTHLY_THRESHOLD
  local dry_run=""
  local optional_words=""
  local listing=""
  local file
  local count=0
  local OPTION
  # A local OPTIND makes option parsing start from the first argument on every call.
  local OPTIND=1

  # The leading ':' selects getopts' silent error reporting: a missing argument
  # arrives as ':' and an unknown option as '?', both with the offending option
  # letter in OPTARG, so the messages below can name it.
  while getopts ":hd:w:m:l" OPTION; do
    case "$OPTION" in
      h)
        echo "$HELP"
        exit
        ;;
      d)
        # parse_threshold reports the problem itself; just stop on failure.
        daily_threshold=$(parse_threshold "$OPTARG") || exit 1
        ;;
      w)
        weekly_threshold=$(parse_threshold "$OPTARG") || exit 1
        ;;
      m)
        monthly_threshold=$(parse_threshold "$OPTARG") || exit 1
        ;;
      l)
        dry_run="true"
        ;;
      :)
        fail "Missing argument for '$OPTARG'. To see help run: weed.sh -h"
        ;;
      \?)
        fail "Unknown option '$OPTARG'. To see help run: weed.sh -h"
        ;;
    esac
  done
  shift "$((OPTIND - 1))"

  if [[ $# -ne 1 ]]; then
    fail "expected a single argument, directory"
  fi

  # Work out the complete list before touching anything, so that a failure
  # while listing can never leave the directory half weeded.
  listing=$(
    weed_directory "$1" "$daily_threshold" "$weekly_threshold" "$monthly_threshold"
  ) || fail "Could not list backups in $1"

  # Read one path per line so that paths containing spaces stay intact.
  while IFS= read -r file; do
    # An empty listing still yields a single empty line; skip it.
    [[ -n "$file" ]] || continue
    count=$((count + 1))
    if [[ -n "$dry_run" ]]; then
      echo "$file"
    else
      echo "deleting $file" >&2
      rm -- "$file" || fail "Could not delete $file"
    fi
  done <<< "$listing"

  if [[ -n "$dry_run" ]]; then
    optional_words="would be "
  fi
  echo -e "$count files ${optional_words}deleted" >&2
}
# Run main only when this file is executed as a script. When it is sourced
# (as the test suite does) it just defines the functions above.
if [[ "${BASH_SOURCE[0]}" -ef "$0" ]]; then
  main "$@"
fi