# Exclusion dictionaries for Norwegian Nynorsk
# ────────────────────────────────────────────
#
# [Note: This file is runnable using ‘sh’.]
# 
# The file ‘klammeformer.dat’ contains a list of all unofficial
# forms (‘klammeformer’ and ‘unormerte ord/former’) of
# Norwegian Nynorsk words listed in Norsk ordbank
# (http://www.edd.uio.no/prosjekt/ordbanken/). It is generated
# from ‘fullform_nn.txt’ using these commands:

# Step 1: copy the word list, leaving out every line that starts with ‘*’.
grep -v '^*' fullform_nn.txt > alle.txt

# Step 2: of those lines, keep only the ones that contain neither the
# string ‘unormert’ nor the string ‘ klammeform’.
grep -Fv -e "unormert" -e " klammeform" alle.txt > hovudformer.txt

# Step 3: reduce both line sets to a sorted list of the distinct values
# found in the third tab-separated field (tab is the default delimiter
# of ‘cut’, so it does not have to be spelled out).
for liste in alle hovudformer
do
  cut -f3 "$liste.txt" | sort | uniq > "$liste.dat"
done

# Step 4: write the entries that occur in alle.dat but not in
# hovudformer.dat. hovudformer.dat is built from a subset of the lines
# behind alle.dat, so the second column of ‘comm -3’ is always empty.
comm -3 alle.dat hovudformer.dat > klammeformer.dat

# Clean up the intermediate files.
rm -f alle.* hovudformer.*


# The file ‘e-infinitiv.dat’ contains a list of all infinitives
# ending in -e where there are no other word forms with the 
# exact same spelling. For example, it contains the word ‘lagre’
# (should be spelled ‘lagra’ according to our translation guidelines), 
# but not the word ‘opne’, as ‘opne’ is also used as an adjective,
# for example in ‘fleire opne program’. The file is generated from
# ‘fullform_nn.txt’ using these commands:

# Build ‘e-infinitiv.dat’ from ‘fullform_nn.txt’ in three steps, using two
# scratch files (ea-inf.txt and e-ord.txt) that are removed at the end.

# A tab character, kept in a variable so that the patterns below do not
# depend on an invisible literal tab surviving editors and copy/paste.
tab=$(printf '\t')

# Step 1: print the second field of every line whose fourth field starts
# with ‘verb inf’ or ‘verb imp’, whose third field ends in -a, and whose
# second field equals that third field with the final -a replaced by -e.
awk -F'\t' '
{
  form = substr($4, 1, 8)
  lengd = length($3)
  bokstav = substr($3, lengd, 1)       # last character of field 3
  stamme = substr($3, 1, lengd - 1)    # field 3 without its last character
  if ((form == "verb inf" || form == "verb imp") && bokstav == "a" && $2 == stamme "e")
    print $2
}' fullform_nn.txt | sort -u > ea-inf.txt

# Step 2: list every third-field value ending in -e that occurs on a line
# which does NOT contain a tab followed by ‘verb inf’ or ‘verb imp’.
# Two fixed-string -e patterns are used instead of the BRE alternation
# ‘\|’, which is a GNU extension whose meaning POSIX leaves undefined.
grep -Fv -e "${tab}verb inf" -e "${tab}verb imp" fullform_nn.txt |
  cut -f3 |
  grep 'e$' |
  sort -u > e-ord.txt

# Step 3: keep the entries of ea-inf.txt that do not also appear in
# e-ord.txt, i.e. the -e spellings that are used by nothing but the
# infinitive/imperative lines.
comm -23 ea-inf.txt e-ord.txt > e-infinitiv.dat

# Remove only the scratch files that this section created.
rm -f ea-inf.txt e-ord.txt
