#! /usr/bin/env python
#  Copyright (c) 2014  Pavel Rychly

"""Decode a corpus -- creates a vertical file from encoded data
"""

import sys
import manatee

import bisect

class PriorityQueue:
    """Minimal priority queue stored as an ascending sorted list.

    Items are ordered by ordinary comparison, so the smallest item is
    always at the front of `self.queue`.
    """

    def __init__ (self):
        # Sorted list of queued items, smallest first.
        self.queue = []

    def put(self, item):
        """Insert `item` at its sorted position; falsy items are ignored."""
        if not item:
            # Lets callers pass None (or any empty value) without checking.
            return
        bisect.insort_right(self.queue, item)

    def get(self):
        """Remove and return the smallest item (IndexError when empty)."""
        smallest = self.queue[0]
        del self.queue[0]
        return smallest

    def top(self):
        """Return the smallest item without removing it (IndexError when empty)."""
        return self.queue[0]

    def size(self):
        """Return the number of queued items."""
        return len(self.queue)

class StructLines:
    """Produce the tag lines of one corpus structure, one event at a time.

    Every call to next_event() returns the next opening, closing or empty
    tag of the structure together with the token position it is tied to.
    """

    def __init__ (self, corp, structname, starttoken):
        """Set up tag generation for structure `structname` of `corp`.

        The attributes shown in opening tags are the entries of the
        STRUCTATTRLIST configuration value (comma-separated
        `structure.attribute` names) whose structure part equals
        `structname`.  Generation starts at the structure number returned
        by num_next_pos(starttoken); a negative result is replaced by 0.
        """
        self.name = structname
        self.struct = corp.get_struct(structname)
        # List of (attribute name, attribute object) pairs.
        self.attrs = []
        conf = corp.get_conf('STRUCTATTRLIST')
        if len(conf) > 0:
            for entry in conf.split(','):
                owner, attrname = entry.split('.')
                if owner == structname:
                    self.attrs.append((attrname,
                                       self.struct.get_attr(attrname)))
        first = self.struct.num_next_pos(starttoken)
        self.currnum = first if first >= 0 else 0
        # True between emitting an opening tag and its closing tag.
        self.inside = False

    def next_event (self):
        """Return the next tag as (token position, tag line, self).

        Returns None once all structures have been consumed.  A structure
        whose begin and end positions coincide is emitted as a single
        empty tag (`<name .../>`) instead of an opening/closing pair.
        """
        num = self.currnum
        if num >= self.struct.size():
            return None

        if self.inside:
            # Closing tag of the current structure; move on to the next one.
            position = self.struct.end(num)
            self.inside = False
            self.currnum = num + 1
            return (position, '</%s>' % self.name, self)

        position = self.struct.beg(num)
        # Attribute values are read for the current structure number, so
        # this has to happen before currnum is advanced below.
        rendered = []
        for attrname, attr in self.attrs:
            value = attr.pos2str(num).replace('"', '\\"')
            rendered.append(' %s="%s"' % (attrname, value))
        attr_text = ''.join(rendered)

        if position == self.struct.end(num):
            # empty tag
            slash = '/'
            self.currnum = num + 1
        else:
            slash = ''
            self.inside = True
        return (position, '<%s%s%s>' % (self.name, attr_text, slash), self)


def decode_corpus (corp, attrs=None, structs=None, fromtok=None, totok=None):
    """Print tokens [fromtok, totok) of `corp` in vertical format.

    One line per token is written to standard output, holding the values
    of the selected attributes joined by tabs.  Structure tags are printed
    on lines of their own, interleaved at their token positions.

    corp    -- corpus object; its get_conf, get_attr, get_struct and size
               methods are used
    attrs   -- comma-separated attribute names (default: the ATTRLIST
               configuration value); an attribute is skipped when its
               `<name>.DYNAMIC` configuration value is truthy
    structs -- comma-separated structure names (default: the STRUCTLIST
               configuration value); empty names are skipped
    fromtok -- first token position to print (default: 0)
    totok   -- position just past the last token to print
               (default: corp.size())
    """
    if attrs is None:
        attrs = corp.get_conf('ATTRLIST')
    if structs is None:
        structs = corp.get_conf('STRUCTLIST')
    if fromtok is None:
        fromtok = 0
    if totok is None:
        totok = corp.size()

    # One text iterator per non-dynamic attribute, positioned at fromtok.
    attrs = [corp.get_attr(a).textat(fromtok) for a in attrs.split(',')
             if not corp.get_conf(a+'.DYNAMIC')]
    # ''.split(',') yields [''], so drop empty names: an empty structure
    # list then simply means "no structures".
    structs = [StructLines(corp, s, fromtok)
               for s in structs.split(',') if s]

    # Holds at most one pending tag per structure, ordered by position.
    nextstruct = PriorityQueue()
    for s in structs:
        nextstruct.put(s.next_event())

    # A single parenthesised argument makes print(...) behave exactly like
    # the print statement used elsewhere in this file.
    tok = fromtok
    while tok < totok:
        # The queue runs dry once every structure is exhausted (or when
        # there are none); the remaining tokens must still be printed, so
        # check size() before peeking -- top() fails on an empty queue.
        while nextstruct.size() and nextstruct.top()[0] <= tok:
            _, line, s = nextstruct.get()
            print(line)
            nextstruct.put(s.next_event())

        print('\t'.join([a.next() for a in attrs]))
        tok += 1

    # Flush the tags positioned exactly at totok; their follow-up events
    # are not queued again.
    while nextstruct.size() and nextstruct.top()[0] == totok:
        _, line, _ = nextstruct.get()
        print(line)
            
if __name__ == "__main__":
    # Command-line entry point: the first argument names the corpus to
    # decode; without any argument a short usage message is printed.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: %s CORPUS" % sys.argv[0])
        print("Decodes a corpus -- creates a vertical file from encoded data")
    else:
        decode_corpus(manatee.Corpus(cli_args[0]))
