#!/usr/bin/env python3
# Copyright 2007-2020  Pavel Rychly

import manatee
import sys

# Usage text, printed line by line when too few arguments are given.
_USAGE = (
    'usage: lsslex.py [-s] CORPNAME STRUCTNAME STRUCTATTR',
    '       lists number of tokens for all structure attribute values',
    '       with -s output is stored in binary in CORPUSPATH/STRUCTNAME.STRUCTATTR.token',
    'example: lsslex.py bnc bncdoc alltyp',
)


def _count_tokens(ranges):
    """Return the summed length (``peek_end() - peek_beg()``) of all ranges.

    *ranges* is the stream object returned by ``struct.attr_val``; it is
    consumed by this call.
    """
    # The current range is measured before the first next() call.
    # NOTE(review): assumes a stream with no ranges reports
    # peek_end() == peek_beg(), contributing 0 -- confirm against manatee.
    total = ranges.peek_end() - ranges.peek_beg()
    while ranges.next():
        total += ranges.peek_end() - ranges.peek_beg()
    return total


def main(argv):
    """Print or store the token count of every value of a structure attribute.

    Args:
        argv: full argument vector, ``argv[0]`` being the program name,
            followed by an optional ``-s`` and then CORPNAME, STRUCTNAME and
            STRUCTATTR.  Extra trailing arguments are ignored.

    Returns:
        Process exit status: 1 when the usage text was printed because
        required arguments are missing, 0 otherwise.
    """
    args = list(argv[1:])
    # "-s" is optional and only recognised as the first argument; strip it
    # *before* counting so it is not mistaken for a positional argument.
    store = args[:1] == ["-s"]
    if store:
        del args[0]
    if len(args) < 3:
        for line in _USAGE:
            print(line)
        return 1
    corpname, structname, attrname = args[:3]

    counts = None
    if store:
        # Imported lazily: only needed for the binary output mode.
        import array
        # "Q" = unsigned 64-bit items, one per attribute value id.
        counts = array.array("Q")

    corp = manatee.Corpus(corpname)
    struct = corp.get_struct(structname)
    attr = struct.get_attr(attrname)

    for val_id in range(attr.id_range()):
        tokens = _count_tokens(struct.attr_val(attrname, val_id))
        if counts is None:
            print(tokens, attr.id2str(val_id), sep='\t')
        else:
            counts.append(tokens)

    # Compare against None rather than relying on truthiness: an empty array
    # is falsy, yet -s should still (re)write the file in that case.
    if counts is not None:
        path = "%s/%s.%s.token" % (corp.get_conf("PATH"), structname, attrname)
        with open(path, "wb") as out:
            counts.tofile(out)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
