#!/bin/bash -eu
# Copyright 2013-2019  Milos Jakubicek
# Enable errexit/nounset here as well as on the shebang line: interpreter
# options on the shebang are ignored when the script is started as
# "bash SCRIPT", which would otherwise leave only pipefail active.
set -euo pipefail

# Command line: CORPUS [--no-alignsizes]
# NO_ALIGNSIZES becomes 0 for a single argument and 1 when the second argument
# is exactly "--no-alignsizes". Every other argument shape leaves it empty,
# which triggers the usage message and exit status 1 below.
NO_ALIGNSIZES=
case "$#" in
    1)
        NO_ALIGNSIZES=0
        ;;
    2)
        if [[ "$2" == "--no-alignsizes" ]]; then
            NO_ALIGNSIZES=1
        fi
        ;;
esac
if [[ -z "$NO_ALIGNSIZES" ]]; then
    echo "Usage: $0 CORPUS [--no-alignsizes]"
    exit 1
fi

# First positional argument: the corpus handed to every corpinfo call below.
CORPUS=$1
# Starts at zero; the real word count is filled in further down.
WORDCOUNT=0

# Value printed by "corpinfo -p"; the sizes file is written directly under it.
PATH_=$(corpinfo -p "$CORPUS")
COMPILED_SIZES_FILE="$PATH_/sizes"

# Values looked up with "corpinfo -g NAME". NONWORDRE is not referenced again
# in this script. Errors from the VIRTUAL lookup are hidden on purpose.
DOCSTRUCTURE=$(corpinfo -g DOCSTRUCTURE "$CORPUS")
NONWORDRE=$(corpinfo -g NONWORDRE "$CORPUS")
VIRTUAL=$(corpinfo -g VIRTUAL "$CORPUS" 2>/dev/null)

# Combine several "sizes" files into one by adding up their counts line by line.
# Arguments: $@ - paths of the sizes files to combine (may contain spaces)
# Outputs:   for each pasted line, its first field followed by the sum of its
#            even-numbered fields (the counts when every input line has the
#            form "KEY COUNT")
_sum_sizes_files() {
    paste "$@" \
        | awk '{sum = 0; for (i = 2; i <= NF; i += 2) {sum += $i;} print $1, sum }'
}

# Shortcut for virtual corpora: when every range listed in the definition file
# $VIRTUAL spans a whole segment corpus, the sizes file is obtained by summing
# the segments' own sizes files and the script exits here. Otherwise control
# falls through to the computation from scratch below.
if [[ -n "$VIRTUAL" ]]; then
    RECOMPILE=0
    SEGMENTS=()
    # Comment lines and blank lines of the definition file are skipped.
    SKIP_RE='^#|^[[:space:]]*$'
    # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
    while read -r LINE || [[ -n "$LINE" ]]; do
        if [[ "$LINE" =~ $SKIP_RE ]]; then
            continue
        elif [[ "$LINE" == '='* ]]; then
            # "=NAME" starts a new segment; whitespace around the name is
            # dropped so the name can be passed on as one quoted argument.
            LASTCORP=${LINE#=}
            LASTCORP=${LASTCORP//[[:space:]]/}
            if [[ -n "$LASTCORP" ]]; then
                SEGMENTS+=("$LASTCORP")
            fi
        else
            # "FROM,TO" range within the most recent segment.
            FROM=$(printf '%s\n' "$LINE" | cut -f1 -d',' | tr -d '[:space:]')
            TO=$(printf '%s\n' "$LINE" | cut -f2 -d',' | tr -d '[:space:]')
            # A full segment starts at 0 and ends at '$' or at the value
            # printed by "corpinfo -s" for that segment.
            if [[ $FROM -ne 0 || ( $TO != '$' && $TO -ne $(corpinfo -s "$LASTCORP") ) ]]; then
                echo "One of the virtual corpus segments ($LASTCORP) is not full corpus => must recompile sizes from scratch"
                RECOMPILE=1
                break
            fi
        fi
    done < "$VIRTUAL"
    # Without any segment there is nothing to sum (paste would read stdin),
    # so that case also falls through to the computation from scratch.
    if [[ $RECOMPILE -ne 1 && ${#SEGMENTS[@]} -gt 0 ]]; then
        echo "Computing sizes from segments of the virtual corpus"
        SIZEFILES=()
        for SEG in "${SEGMENTS[@]}"; do
            SEGPATH=$(corpinfo -g PATH "$SEG")
            if [[ ! -f "$SEGPATH/sizes" ]]; then
                echo "Compiling sizes for corpus segment: $SEG"
                # NOTE(review): mksizes is presumably this script as installed
                # on PATH -- confirm.
                mksizes "$SEG"
            fi
            SIZEFILES+=("$SEGPATH/sizes")
        done
        _sum_sizes_files "${SIZEFILES[@]}" > "$COMPILED_SIZES_FILE"
        echo "Sizes compiled"
        exit 0
    fi
fi

# Print what "corpinfo -t STRUCT $CORPUS" reports (used below as the count of
# that structure), or 0 when the command fails or prints nothing.
# Globals:   CORPUS (read)
# Arguments: $1 - structure name
# Outputs:   the count on stdout
_structure_count() {
    local count
    # Discard any partial output of a failed call instead of gluing it to 0.
    count=$(corpinfo -t "$1" "$CORPUS" 2>/dev/null) || count=0
    printf '%s\n' "${count:-0}"
}

# Reached whenever the virtual-corpus shortcut above did not finish the job.
# The word count is computed only while WORDCOUNT still holds its initial 0.
if [[ "$WORDCOUNT" -eq 0 ]]; then
    # Count attribute "word" when "corpinfo -g word.TYPE" succeeds, otherwise
    # the attribute named by DEFAULTATTR.
    if corpinfo -g word.TYPE "$CORPUS" >/dev/null 2>&1; then
        ATTR=word
    else
        ATTR=$(corpinfo -g DEFAULTATTR "$CORPUS")
    fi
    WORDCOUNT=$(mkwc "$CORPUS" "$ATTR")
fi
TOKENCOUNT=$(corpinfo -s "$CORPUS")
DOCCOUNT=$(_structure_count "$DOCSTRUCTURE")
PARCOUNT=$(_structure_count p)
SENTCOUNT=$(_structure_count s)

# Start the sizes file afresh; one "KEY VALUE" pair per line.
printf '%s %s\n' \
    tokencount "$TOKENCOUNT" \
    wordcount "$WORDCOUNT" \
    doccount "$DOCCOUNT" \
    parcount "$PARCOUNT" \
    sentcount "$SENTCOUNT" \
    > "$COMPILED_SIZES_FILE"
# Split a comma- and/or whitespace-separated list of names into the global
# array ALIGNS. No pathname expansion is applied to the names.
# Globals:   ALIGNS (written)
# Arguments: $1 - the list to split
_split_aligned() {
    local list=${1//[,[:space:]]/ }
    ALIGNS=()
    read -r -a ALIGNS <<< "$list"
}

# Append the output of lsalsize for every corpus listed in ALIGNED to the
# sizes file, unless that was disabled on the command line or the list is
# empty.
ALIGNED=$(corpinfo -g ALIGNED "$CORPUS")
if [[ "$NO_ALIGNSIZES" == 1 ]]; then
    echo "Alignment size computation disabled; skipping..."
elif [[ -z "$ALIGNED" ]]; then
    echo "No parallel corpora specified in ALIGNED; skipping alignment size computations..."
else
    _split_aligned "$ALIGNED"
    # Index loop: stays valid under nounset even if the list yields no names.
    for (( I = 0; I < ${#ALIGNS[@]}; I++ )); do
        echo "Computing alignment size for corpus ${ALIGNS[I]}..."
        lsalsize "$CORPUS" "${ALIGNS[I]}" >> "$COMPILED_SIZES_FILE"
    done
fi
echo "Sizes compiled"

# vim: ts=4 sw=4 sta et sts=4 si tw=80:
