#!/usr/bin/env python
# Copyright 2007-2014  Pavel Rychly, Milos Jakubicek

import manatee

def print_region(region):
    for i in range(0, len(region), 2):
        print "%s" % region[i],

def print_result(kw_beg, kw_end, refattr, corp, attrs, structs, lrCtxSize,
                 attrdelim, posdelim, kwicdelim, colls):
    kwicbeg, kwicend = kwicdelim.split(",")
    if refattr:
        try:
            ref = corp.get_attr (refattr)
            print '#%i,%s' % (kw_beg, ref.pos2str (kw_beg)),
        except manatee.AttrNotFound:
            ref = corp.get_struct (refattr)
            print '#%i,%s' % (kw_beg, ref.num_at_pos (kw_beg)),
    else:
        print '#%i' % kw_beg,

    corpRegion = manatee.CorpRegion(corp, attrs, structs)
    collposs = {}
    c = 0
    while c < len(colls):
        collposs [colls[c+1]] = colls[c]
        c += 2
    reg_beg = kw_beg - lrCtxSize[0]
    collposs [0] = kwicbeg
    rel_end = kw_end - kw_beg
    if rel_end == 0: # structure
        collposs [0] = kwicbeg + kwicend
    else:
        collposs [rel_end] = kwicend
    # XXX collocations should be handled in CorpRegion so as not to interfere
    # with structures
    for c in sorted(collposs.keys()):
        reg = corpRegion.region(reg_beg, kw_beg + c, posdelim, attrdelim)
        print_region (reg)
        if type(collposs[c]) == type(""):
            print collposs[c],
            reg_beg = kw_beg + c
        else:
            print "%s%d:" % (kwicbeg, collposs[c]),
            reg = corpRegion.region(kw_beg + c, kw_beg + c + 1, posdelim, attrdelim)
            print_region (reg)
            print kwicend,
            reg_beg = kw_beg + c + 1
    last = corpRegion.region(reg_beg, kw_end+lrCtxSize[1], posdelim, attrdelim)
    print_region(last)
    print

def corp_query (corpname, query, refattr, ctxSize, hardcut, attrs, structs,
                defattr, gdex_conf, attrdelim, posdelim, kwicdelim):
    """Evaluate ``query`` on corpus ``corpname`` and print the results.

    corpname  -- corpus name passed to manatee.Corpus
    query     -- query string passed to Corpus.eval_query
    refattr   -- reference attribute/structure name or None
    ctxSize   -- context size: a single number (used for both sides) or a
                 ``LEFT,RIGHT`` string
    hardcut   -- maximum number of results; -1 means no limit without GDEX
                 and 100 with GDEX
    attrs, structs -- attributes and structures to show
    defattr   -- default attribute set on the corpus before evaluation
    gdex_conf -- false value: print results in query order; otherwise a GDEX
                 configuration ("-" is passed to GDEX as None) and results
                 are printed in the order returned by GDEX.best_k
    attrdelim, posdelim -- delimiters; a value starting with a backslash is
                 decoded with the ``string_escape`` codec
    kwicdelim -- ``BEGIN,END`` KWIC delimiters
    """
    # startswith() rather than [0] so that an empty delimiter is accepted
    # instead of raising IndexError.
    if attrdelim.startswith("\\"):
        attrdelim = attrdelim.decode("string_escape")
    if posdelim.startswith("\\"):
        posdelim = posdelim.decode("string_escape")
    try:
        ctxSize = int(ctxSize)
        lrCtxSize = (ctxSize, ctxSize)
    except ValueError:
        # not a single number: expect "LEFT,RIGHT"
        lrCtxSize = map(int, ctxSize.split(","))

    corp = manatee.Corpus (corpname)
    corp.set_default_attr(defattr)
    result = corp.eval_query (query)
    colls = manatee.IntVector()
    result.collocs (colls)
    # colls holds two entries per collocation
    numofcolls = len (colls) // 2
    if gdex_conf:
        try:
            import gdex
        except ImportError:
            import gdex_old as gdex
        result.thisown = False
        conc = manatee.Concordance (corp, result, numofcolls)
        conc.sync()
        if gdex_conf == "-":
            gdex_conf = None
        try:
            G = gdex.GDEX (corp, gdex_conf)
        except Exception:
            G = gdex.GDEX (corp) # old GDEX
        G.entryConc (conc) # load input
        if hardcut == -1:
            hardcut = 100
        G_sort = G.best_k (hardcut, hardcut) # compute and return sorted list
        for _, concnum in G_sort:
            # rebuild the (label, relative position) pairs for this line
            colls = []
            for i in xrange(1, conc.numofcolls() + 1):
                collpos = conc.coll_beg_at (i, concnum)
                if collpos != -1:
                    colls.append (i)
                    colls.append (collpos - conc.beg_at (concnum))
            print_result(conc.beg_at(concnum), conc.end_at(concnum), refattr,
                         corp, attrs, structs, lrCtxSize, attrdelim, posdelim,
                         kwicdelim, colls)
    else:
        while (hardcut > 0 or hardcut == -1) and not result.end():
            colls = manatee.IntVector()
            result.collocs (colls)
            colls = list(colls)
            # drop every pair whose first item is >= 100 or <= -100
            i = 0
            while i < len(colls):
                if colls[i] >= 100 or colls[i] <= -100:
                    colls.pop(i)
                    colls.pop(i)
                else:
                    i += 2
            print_result(result.peek_beg(), result.peek_end(), refattr, corp,
                         attrs, structs, lrCtxSize, attrdelim, posdelim,
                         kwicdelim, colls)
            result.next()
            if hardcut != -1:
                hardcut -= 1


if __name__ == '__main__':
    from sys import argv, path
    from distutils.sysconfig import get_python_lib;
    args = {"-r" : None, "-c" : 15, "-h" : -1, "-a": "DEFAULTATTR",
            "-s" : "", "-k" : "<,>", "-g" : None, "-d": "DEFAULTATTR",
            "-m" : get_python_lib() + "/bonito/", "-e": "/", "-l": " "}
    if argv[2:]:
        args["-d"] = manatee.Corpus (argv[1]).get_conf ("DEFAULTATTR")
        args["-a"] = args["-d"]
        import getopt
        opts, _ = getopt.getopt(argv[3:], "r:c:h:a:s:d:g:m:e:l:k:")
        args.update(dict(opts))
        if args["-k"].count(",") > 1:
            raise Exception("KWIC delimiter must not contain a comma")
        path.insert(0, args["-m"])
        corp_query(argv[1], argv[2], args["-r"], args["-c"], int(args["-h"]),
                   args["-a"], args["-s"], args["-d"], args["-g"], args["-e"],
                   args["-l"], args["-k"])
    else:
        print '''Usage: corpquery.py CORPUSNAME QUERY [ OPTIONS ]\n
Options:\n
  -r ATTR               reference attribute
                        (default: %s)
  -c LEFT,RIGHT | BOTH  left and right or both context length
                        (default: %d)
  -h LIMIT              maximum number of results
                        (default: %d)
  -a ATTR1,ATTR2,...    comma separated list of attributes to be shown
                        (default: %s)
  -s STR1,STR2...       comma separated list of structures to be shown
                        (use struct.attr or struct.* to show structure attributes;
                        (default: %s)
  -d DEFAULT_ATTR       default positional attribute
                        (default: %s)
  -g GDEX_CONF          use GDEX with a given GDEX_CONF configuration file
                        (default: %s; use - for default configuration)
                        use -h to set the result size (default: 100)
  -m GDEX_MODULE_DIR    GDEX module path (directory with gdex.py or gdex_old.py)
                        (default: %s)
  -e DELIMITER          positional attribute delimiter (default: %s)
  -l DELIMITER          positions delimiter (default: %s)
  -k BEGIN,END          mark beginning/end of KWIC by BEGIN/END (default: %s)
''' % (args["-r"], args["-c"], args["-h"], args["-a"], args["-s"], args["-d"], args["-g"],
        args["-m"], args["-e"], args["-l"], args["-k"])

# vim: ts=4 sw=4 sta et sts=4 si tw=80:
