#! /usr/bin/python
#  Copyright (c) 2008-2015  Pavel Rychly, Milos Jakubicek, Vit Baisa

import os
import subprocess
import sys

import manatee

def create_subcorpus (corp, subcdir, subcname, strucname, query):
    """Create the file ``<subcdir>/<subcname>.subc`` for one subcorpus.

    When ``strucname`` is the marker "-CQL-", ``query`` is evaluated with
    ``corp.eval_query`` and the result is handed to
    ``manatee.create_subcorpus``; otherwise the corpus, ``strucname`` and
    ``query`` are passed to ``manatee.create_subcorpus`` directly.

    Progress is reported on stdout.  Returns the path of the subcorpus
    file, or None (after printing a warning) when the manatee call returns
    a false value.
    """
    out = sys.stdout
    target = '/'.join((subcdir, subcname)) + '.subc'
    out.write("Subcorpus %s..." % subcname)

    if strucname != "-CQL-":
        created = manatee.create_subcorpus (target, corp, strucname, query)
    else:
        created = manatee.create_subcorpus (target, corp.eval_query (query))

    if created:
        out.write("done\n")
        return target

    # A false result is reported as an empty subcorpus.
    out.write("warning: empty subcorpus for query '%s %s'\n"
              % (strucname, query))
    return None

def _run_mkstats (corpname, attr, stat, path):
    """Run the external ``mkstats`` tool for one attribute/statistic pair.

    The command is passed as an argument list, without a shell, so corpus
    names and paths containing spaces or shell metacharacters reach mkstats
    unchanged.  Failures are reported on stdout and never abort the run.
    """
    cmd = ["mkstats", corpname, attr, stat, path]
    try:
        status = subprocess.call (cmd)
    except OSError as err:
        sys.stdout.write("warning: cannot run '%s': %s\n"
                         % (" ".join(cmd), err))
        return
    if status != 0:
        sys.stdout.write("warning: '%s' exited with status %s\n"
                         % (" ".join(cmd), status))

def process_def_file (corpname, subcdir, infile):
    """Compile all subcorpora described in a subcorpus definition file.

    corpname -- corpus name/configuration passed to manatee.Corpus and to
                mkstats
    subcdir  -- directory for the resulting .subc files (created if missing)
    infile   -- path of the subcorpus definition file

    Recognised lines of the definition file:
      * lines starting with '#' and blank lines are ignored;
      * '*FREQLISTATTRS attr ...' adds attributes for which statistics
        are compiled;
      * '=NAME' starts a subcorpus definition; the next two lines hold the
        structure name (or '-CQL-') and the query.  Definitions whose
        query starts with 'Q:' are skipped.
    Any other line is reported as a format error.

    For every subcorpus that was created, mkstats is run for each
    FREQLISTATTRS attribute and each statistic in the module-level
    ``compile_stats`` list.
    """
    sys.stdout.write("Compiling subcorpora...\n")
    # compile_stats is a global set by the command-line entry point; treat
    # it as empty when this module is imported and it was never defined.
    stats = globals().get("compile_stats") or []
    with open(infile) as deffile:
        corp = manatee.Corpus(corpname)
        if not os.path.isdir(subcdir):
            os.mkdir(subcdir)
        # ATTRLIST is taken to be one comma-separated string of attribute
        # names; split it so that membership is tested against whole names
        # (a plain substring test would accept e.g. "em" for "lemma").
        valid_attrs = set(name.strip()
                          for name in corp.get_conf("ATTRLIST").split(","))
        frqattrs = []
        # iter(readline, '') keeps the explicit readline() calls below
        # safe; mixing them with plain file iteration is not.
        for line in iter(deffile.readline, ''):
            if line.startswith('#') or not line.strip():
                # ignore comments and empty lines
                continue
            if line.startswith('*FREQLISTATTRS'):
                for attr in line.split()[1:]:
                    if attr in valid_attrs:
                        frqattrs.append (attr)
                    else:
                        sys.stdout.write("'%s' is listed in FREQLISTATTRS but is not a valid attribute, skipping\n" % attr)
            elif line.startswith('='):
                if not frqattrs:
                    sys.stdout.write('No FREQLISTATTRS given -- exiting.\n')
                    break
                subcname = line[1:].strip()
                structname = deffile.readline().strip()
                query = deffile.readline().strip()
                if query.startswith('Q:'): # subcorpus from concordance, skip
                    continue
                path = create_subcorpus (corp, subcdir, subcname,
                                         structname, query)
                if path and stats:
                    for attr in frqattrs:
                        for stat in stats:
                            _run_mkstats (corpname, attr, stat, path)
            else:
                sys.stdout.write('Subcorpus definition file format error: "%s"\n'
                                 % line.strip())

def split_cli_args (argv):
    """Split a full command line into (statistics, positional arguments).

    ``argv`` has the program name at index 0.  When it continues with
    ``-s STATS``, STATS is split on commas (empty names are dropped) and
    the positional arguments are whatever follows; otherwise no statistics
    are requested and everything after the program name is positional.
    """
    if len(argv) > 2 and argv[1] == "-s":
        stats = [name for name in argv[2].split(",") if name]
        return stats, list(argv[3:])
    return [], list(argv[1:])

if __name__ == "__main__":
    # compile_stats must stay a module-level name: process_def_file reads it.
    compile_stats, cli_args = split_cli_args (sys.argv)
    if len(cli_args) == 3: # process subcorpus
        process_def_file (*cli_args)
    else:
        # Too few or too many arguments: show the usage text
        # rather than failing inside the call above.
        stat_names = ["frq", "arf", "docf", "fid", "aldf"]
        usage = [
            'Usage: mksubc [-s STATS] CORPUS SUBCORP_DIR SUBCORP_DEF_FILE',
            'Creates subcorpora (*.subc) files according to subcorpus',
            'definition file (see doc/subcdef.txt)\n',
            '-s STATS          compute statistics for all FREQLISTATTRS,',
            '                  STATS is a comma separated list of statistics,',
            '                  which might be one of ' + ", ".join(stat_names),
            'CORPUS            corpus configuration file',
            'SUBCORP_DIR       directory for the .subc to be stored',
            'SUBCORP_DEF_FILE  subcorpus definition file',
        ]
        sys.stdout.write("\n".join(usage) + "\n")

# vim: ts=4 sw=4 sta et sts=4 si cindent tw=80:
