#!/bin/sh -efu
#
# Copyright (C) 2023  BaseALT /basealt.ru/
#
# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
#

# Program name (basename of the script path) and version shown by --version.
PROG="${0##*/}"
PROG_VERSION=0.1.0

# Default database and table that hold the audit records.
AUDIT_DATA_DB=audit
AUDIT_DATA_TABLE=AuditDataRaw

# Column names used in the generated SQL.  Each one keeps a non-empty value
# inherited from the environment and falls back to the default otherwise.
: "${RECORD_NODE_FIELD:=record_node}"
: "${RECORD_TIMESTAMP_FIELD:=record_timestamp}"
: "${RECORD_MILLI_FIELD:=record_milli}"
: "${RECORD_SERIAL_FIELD:=record_serial}"
: "${RECORD_TEXT_FIELD:=record_text}"

# Print the full help page to stdout.
# Globals: PROG, AUDIT_DATA_DB, AUDIT_DATA_TABLE (read).
# The option synopsis must match the getopt specification used by the
# script: short options take their argument as a separate word ("-c CMP"),
# long options take it after '='.
show_help()
{
	cat <<EOF
Usage: $PROG [options] [START] CHUNKSIZE

$PROG is a tool to export audit data from a Clickhouse database.

Options:

  -h HOST, --host=HOST       host name or IP to connect to Clickhouse
                             database with audit data;

  -u USER, --user=USER       user name to authenticate with;

  -d DB, --db=DB             database name to use (default is
                             '$AUDIT_DATA_DB');

  -t TABLE, --table=TABLE    table name (default is '$AUDIT_DATA_TABLE');

  -n NODE, --node=NODE       node name to select data for (by default,
                             all nodes found in the table are exported);

  -O OUTDIR, --outdir=OUTDIR directory where to write the files;

  -w WIDTH, --digits=WIDTH   how many digits index numbers should
                             be wide; the default is 3 + width of
                             CHUNKSIZE;

  --dry-run                  print out the resulting SQL command and
                             exit;

  -r, --raw                  export raw data (record_text) column;
                             currently, the only mode that is supported;

  -c CMP, --compress=CMP     compress the resulting data with
                             compressor CMP (gz, bz2, xz);

  -q, --quiet                don't print any info messages;

  -V, --version              print program version and exit;

  --help                     show this text and exit.


Report bugs to https://bugzilla.altlinux.org/.

EOF
}

# Print the program name, version and the copyright notice to stdout.
# Globals: PROG, PROG_VERSION (read).
print_version()
{
	printf '%s version %s\n' "$PROG" "$PROG_VERSION"
	printf '%s\n' \
	    'Written by: see the source for author info.' \
	    '' \
	    'Copyright (C) 2023 BaseALT /basealt.ru/' \
	    'This is free software; see the source for copying conditions.' \
	    'There is NO warranty; not even for MERCHANTABILITY or FITNESS' \
	    'FOR A PARTICULAR PURPOSE.'
}

# Print a short usage reminder to stdout.
# Globals: PROG (read).
# Returns: always 1, so that callers can treat the call as a failure.
show_usage()
{
	printf 'Usage: %s [options] [START] CHUNKSIZE\n\n' "$PROG"
	printf 'Run %s --help to see the help page.\n' "$PROG"
	return 1
}

# Normalize the command line with getopt(1) and install the result as the
# new positional parameters.
#  - The short option string lists option letters only; a ':' marks an
#    option that takes an argument (no separators between letters).
#  - The long option list includes 'user:' so that --user=USER is accepted
#    like the other documented options.
#  - On a parse error the usage reminder is printed and the script exits
#    with status 1 explicitly, without relying on 'set -e' being active.
OPTS="$(getopt -n "$PROG" -o h:u:d:t:n:O:rVqc:w: \
               -l host:,user:,db:,table:,node:,outdir:,dry-run,raw,version,help,compress:,quiet,digits: \
               -- "$@")" || { show_usage; exit 1; }
eval set -- "$OPTS"


# Option values.  An empty value means "not given on the command line";
# db and table start with the built-in defaults.
host=
user=
db="$AUDIT_DATA_DB"
table="$AUDIT_DATA_TABLE"
node=
# Set to 1 when --node was given with an empty name, to tell that case
# apart from --node not being given at all.
empty_node=
outdir=
raw=
dry_run=
compress=
quiet=
width=

# Walk over the getopt-normalized positional parameters.  Options that take
# a value consume it with an extra 'shift'; the loop ends at the '--'
# separator, leaving only the non-option arguments.  The loop also stops
# when the parameters run out, so it cannot spin on a missing '--'.
while [ "$#" -gt 0 ]; do
    case "$1" in
	-h|--host)
	    host="$2"
	    shift
	    ;;
	-u|--user)
	    user="$2"
	    shift
	    ;;
	-d|--db)
	    db="$2"
	    shift
	    ;;
	-t|--table)
	    table="$2"
	    shift
	    ;;
	-n|--node)
	    node="$2"
	    shift
	    [ -n "$node" ] || empty_node=1
	    ;;
	-O|--outdir)
	    outdir="$2"
	    shift
	    ;;
	-c|--compress)
	    compress="$2"
	    shift
	    ;;
	-w|--digits)
	    width="$2"
	    shift
	    ;;
	-r|--raw)
	    raw=1
	    ;;
	--dry-run)
	    dry_run=1
	    ;;
	-q|--quiet)
	    # Both spellings are accepted: getopt passes -q through as well.
	    quiet=1
	    ;;
	-V|--version)
	    print_version
	    exit 0
	    ;;
	--help)
	    show_help
	    exit 0
	    ;;
	--)
	    shift
	    break
	    ;;
	*)
	    # Report directly on stderr; no error helper is defined at this
	    # point of the script.
	    printf 'ERROR: Unrecognized option: %s\n' "$1" >&2
	    exit 1
	    ;;
    esac
    shift
done

# Positional arguments: "CHUNKSIZE" or "START CHUNKSIZE".
# An empty START means that the starting offset is determined later from
# the files already present in the output directory.
START=
CHUNKSIZE=
case "$#" in
    1)
	CHUNKSIZE="$1"
	;;
    2)
	START="$1"
	CHUNKSIZE="$2"
	;;
    *)
	show_usage
	exit 1
	;;
esac

# Both values are pasted into the SQL text (LIMIT/OFFSET) and used in
# integer comparisons and arithmetic, so accept decimal digits only.
case "$CHUNKSIZE" in
    ''|*[!0-9]*)
	echo "CHUNKSIZE must be a non-negative integer: '$CHUNKSIZE'." >&2
	exit 1
	;;
esac
case "$START" in
    *[!0-9]*)
	echo "START must be a non-negative integer: '$START'." >&2
	exit 1
	;;
esac

# Validate the compression method.  An empty value (no -c/--compress given)
# is valid and means that the chunks are written uncompressed.
case "$compress" in
    ''|gz|bz2|xz)
	;;
    *)
	echo "Unsupported compression method: $compress! Use: gz | bz2 | xz." >&2
	exit 1
	;;
esac

# Print an informational message to stderr unless quiet mode is on.
# Globals:   quiet (read) - the message is suppressed when it equals "1".
# Arguments: $1 - printf format string; the rest - its arguments.
print_info() {
    local format="$1"
    shift
    if [ "$quiet" != "1" ]; then
	printf "$format" "$@" >&2
    fi
}

# Tell the user which database and table the data is read from.
print_info 'Using database %s.\n' "$db"
print_info 'Using table %s.\n' "$table"

# Unless given with -w/--digits, the zero-padded index width is the number
# of characters in CHUNKSIZE plus three.
[ -n "$width" ] || {
    width=$(( $(echo -n "$CHUNKSIZE" | wc -c) + 3 ))
    print_info 'Using %d digits for indexing.\n' "$width"
}

# Remove the temporary data of the script: the scratch directory and the
# partially written chunk files, if their names are known.
# Globals: workdir, part, outpart (read; each may be unset or empty).
cleanup() {
    local leftover

    [ -z "${workdir:-}" ] || rm -rf "$workdir"

    for leftover in "${part:-}" "${outpart:-}"; do
	if [ -n "$leftover" ]; then
	    rm -f "$leftover"
	fi
    done
}

# Private scratch directory under the system temporary directory; the EXIT
# trap makes cleanup() remove it when the script terminates.
workdir="$(mktemp --directory --tmpdir "$PROG.XXXX")"
trap cleanup EXIT

# Run an SQL statement with clickhouse-client and pass its output through.
# Globals:   host, db, user (read) - each adds the matching client option
#            (-h, -d, -u) only when it is non-empty.
# Arguments: the statement; several arguments are joined as "$*" does.
# Returns:   the exit status of clickhouse-client.
query() {
    local statement="$*"

    # Reuse the function's own positional parameters as the argument list.
    set --
    [ -z "${host:-}" ] || set -- "$@" -h "$host"
    [ -z "${db:-}" ] || set -- "$@" -d "$db"
    [ -z "${user:-}" ] || set -- "$@" -u "$user"

    printf '%s\n' "$statement" | clickhouse-client "$@"
}

# Prepare the list of nodes to export, one name per line, in $workdir/nodes.
# A node given with --node (even an explicitly empty one) is used as is;
# otherwise the distinct node names are fetched from the table.
if [ -n "$node" ] || [ -n "$empty_node" ]; then
    echo "$node" >"$workdir/nodes"
else
    print_info 'Query node names...\n'
    query "SELECT DISTINCT \`$RECORD_NODE_FIELD\` FROM \`$table\` ORDER BY \`$RECORD_NODE_FIELD\`;" >"$workdir/nodes"
fi

# Find the most recent export file of a node.
# Globals:   outdir (read) - directory to search; the current directory
#            when empty.  Subdirectories are searched too.
# Arguments: $1 - node name; may be empty for records without a node name.
# Outputs:   a single line "END START FILENAME" for the file with the
#            highest END index, or nothing when no file matches.
#
# Export files are named "[NODE-]START-END.log[.gz|.bz2|.xz]".  The part in
# front of START must be exactly "NODE-" (or absent for the empty node);
# otherwise files of a node called "web-1" would be taken for files of the
# node "web" or of the empty node.
find_node_last_export() {
    local node_name="$1"
    local prefix="${node_name:+$node_name-}"
    local prefix_re

    # Escape the regular expression metacharacters of the prefix so that it
    # is matched literally by sed.
    prefix_re="$(printf '%s\n' "$prefix" | sed -e 's/[][\.*^$/]/\\&/g')"

    find -L ${outdir:+"$outdir"} -name "${prefix}*-*.log*" | \
	sed -n -e 's,^.*/,,' \
	       -e "s/^${prefix_re}"'\([0-9]\+\)-\([0-9]\+\)\.log\(\.\(gz\|bz2\|xz\)\)\?$/\2 \1 &/p' | \
	sort -n | tail -1
}

# Main export loop.  For every node listed in $workdir/nodes the records are
# fetched in chunks of CHUNKSIZE rows, ordered by timestamp, milliseconds
# and serial number, and each chunk is stored as
# "[NODE-]START-END.log[.CMP]" in the output directory, where START and END
# are the zero-padded offsets of the first and the last record of the chunk.
# NOTE(review): 'read' is used without -r and with the default IFS, so
# backslashes and surrounding blanks in node names are processed by the
# shell; kept as is because the format of the node list depends on the
# client output format - confirm before changing.
while read node_name; do
    print_info 'Exporting audit data for node "%s".\n' "$node_name"

    # The node name becomes a single-quoted SQL string literal: escape
    # backslashes and single quotes with a backslash.
    node_sql="$(printf '%s\n' "$node_name" | sed -e 's/\\/\\\\/g' -e "s/'/\\\\'/g")"

    # Starting offset: the START argument if given, otherwise one past the
    # end index of the latest export file found for the node, otherwise 0.
    start_from="$START"
    if [ -z "$start_from" ]; then
	last_export="$(find_node_last_export "$node_name")"
	if [ -n "$last_export" ]; then
	    last_end="$(echo "$last_export" | cut -f1 -d' ')"
	    # Drop the zero padding so that the number is not taken as octal.
	    while [ "$last_end" != "${last_end#0}" ]; do
		last_end="${last_end#0}"
	    done
	    start_from=$((last_end + 1))
	    print_info 'Found %s. Will continue from %d.\n' \
		       "$(echo "$last_export" | cut -f3 -d' ')" \
		       "$start_from"
	else
	    start_from=0
	    print_info 'No previous export files for node "%s" found in %s. Will start at %d.\n' "$node_name" "${outdir:-.}" "$start_from"
	fi
    else
	print_info 'Start at %d as specified.\n' "$start_from"
    fi

    chunks=0
    while :; do
	# The node column name comes from RECORD_NODE_FIELD, the same setting
	# that is used to build the list of nodes.
	sql="SELECT \`$RECORD_TEXT_FIELD\` FROM \`$table\` WHERE coalesce(\`$RECORD_NODE_FIELD\`, '') = '$node_sql' ORDER BY (\`$RECORD_TIMESTAMP_FIELD\`, \`$RECORD_MILLI_FIELD\`, \`$RECORD_SERIAL_FIELD\`) LIMIT $CHUNKSIZE OFFSET $start_from"

	basename="${node_name:+$node_name-}$(printf "%0${width}d" "$start_from")"

	if [ -n "$dry_run" ]; then
	    echo "$sql" >&2
	    break
	fi

	print_info 'Query %d records for "%s" starting from %d.\n' \
		       "$CHUNKSIZE" "$node_name" "$start_from"

	# Without compression the chunk is written next to its final place
	# and renamed afterwards; with compression the raw data goes to the
	# scratch directory first.
	part=
	if [ -z "$compress" ]; then
	    part="${outdir:+${outdir%/}/}$basename.part"
	else
	    part="$workdir/$basename.part"
	fi

	query "$sql" >"$part" || exit $?
	count="$(wc -l <"$part")"

	if [ -n "$count" ] && [ "$count" -gt 0 ]; then
	    print_info 'Query succeeded. %d records exported.\n' \
		       "$count"
	else
	    if [ $chunks -gt 0 ]; then
		print_info 'No more records for "%s".\n' "$node_name"
	    else
		print_info 'No records for "%s".\n' "$node_name"
	    fi
	    # Remove the empty chunk now: cleanup() only knows the name of
	    # the last chunk, so the file would be left behind when more
	    # nodes follow.
	    rm -f "$part"
	    break
	fi

	end_at=$((start_from + count - 1))
	start_end_name="$basename-$(printf "%0${width}d" "$end_at")"
	filename="${outdir:+${outdir%/}/}$start_end_name.log"

	if [ -n "$compress" ]; then
	    filename="$filename.$compress"
	    outpart="${outdir:+${outdir%/}/}$start_end_name.part.$compress"
	    case "$compress" in
		gz)
		    print_info 'Compressing the chunk with gzip...\n'
		    gzip -c --best <"$part" >"$outpart"
		    ;;
		bz2)
		    print_info 'Compressing the chunk with bzip2...\n'
		    bzip2 -c --best <"$part" >"$outpart"
		    ;;
		xz)
		    print_info 'Compressing the chunk with xz...\n'
		    xz -c --best <"$part" >"$outpart"
		    ;;
		*)
		    echo "BUG! Unexpected compression method: $compress!" >&2
		    exit 2
		    ;;
	    esac
	else
	    outpart="$part"
	fi

	# The final name appears only when the chunk is complete.
	print_info 'Renaming the chunk to %s\n' "$filename"
	mv "$outpart" "$filename"

	rm -f "$part" "$outpart"

	chunks=$((chunks + 1))
	start_from=$((end_at + 1))

	# A short chunk means that the end of the data has been reached.
	if [ "$count" -lt "$CHUNKSIZE" ]; then
	    print_info 'No more records for "%s".\n' "$node_name"
	    break
	fi
    done
done <"$workdir/nodes"
