#!/usr/bin/env python
# vim: set ts=8 sw=4 sts=4 et ai tw=79:

# Input selection. `source` is the compiled aspell word list to read and
# `first_data` is a byte sequence found at the start of the word data; the
# reader searches for it so the file headers can be skipped without having to
# understand any of them.
# Earlier marker variant that also included three leading bytes:
#first_data = '\x0e\x0e\x0aabacadabra\x00'
source = '/var/lib/aspell/nl.rws'
first_data = 'abracadabra\x00'

# I attempted to run this on sv.rws too, but that did not work out well:
# there are some complicating factors involved. Use this for the Dutch list
# only.
#source = first_data = ('/var/lib/aspell/sv.rws',
#                       '\x0e\x16\x0cschaktmassor\x00\x0e\x03&AL\x00')


def aspellreader(file, skip_until):
    """Yield (word, inf1, inf2) tuples read from a compiled aspell word list.

    The header is skipped by scanning for `skip_until`, which must be the
    payload of the first word record. Each record is parsed as three
    single-byte fields (inf1, inf2, size), followed by `size` bytes of word
    and a NUL terminator.

    Parameters:
        file: object whose read(n) returns str chunks ('' at end of file).
        skip_until: 1..4096 character marker locating the first record.

    Yields:
        (word, inf1, inf2) where inf1 and inf2 are the integer values of the
        first two record bytes. Their meaning is not decoded here.

    Raises:
        ValueError: if `skip_until` has an invalid length, or if a record is
            not followed by a NUL byte. The offending word has already been
            yielded when the latter is raised.

    Nothing is yielded when the marker is never found, or when it is found
    with fewer than three bytes in front of it (no room for a record prefix).
    """
    # Explicit raise instead of assert so the check survives `python -O`.
    if not 1 <= len(skip_until) <= 4096:
        raise ValueError('skip_until must be between 1 and 4096 long, got %d'
                         % (len(skip_until),))

    prefix_len = 3  # inf1, inf2 and size precede every word
    # The size field is a single byte, so no record is longer than this.
    max_record = prefix_len + 255 + 1
    # While scanning, keep enough of the tail to hold a partial marker match
    # plus the record prefix in front of it; that way a record straddling a
    # chunk boundary never loses its prefix.
    keep = len(skip_until) + prefix_len - 1
    buf = ''

    # Skip past the header.
    while True:
        more = file.read(4096)
        if not more:
            return  # marker never found
        buf += more
        index = buf.find(skip_until)
        if index >= 0:
            break
        buf = buf[-keep:]

    if index < prefix_len:
        # A negative slice start would wrap around to the end of the buffer.
        return
    buf = buf[(index - prefix_len):]

    # Start reading the words one at a time.
    eof = False
    while True:
        # Make sure a complete record of any size is buffered when available.
        while len(buf) < max_record and not eof:
            more = file.read(4096)
            if more:
                buf += more
            else:
                eof = True

        if len(buf) < 4:
            break
        inf1, inf2, size = ord(buf[0:1]), ord(buf[1:2]), ord(buf[2:3])
        if inf1 & 0b01000000:
            # Treated as the end of the word data (presumably a marker
            # record; the original author was not sure either).
            break

        end = prefix_len + size
        if len(buf) <= end:
            break  # truncated record at end of file
        word = buf[prefix_len:end]
        yield word, inf1, inf2

        if buf[end] != '\x00':
            # Raise explicitly: with a bare assert, `python -O` would leave
            # buf untouched and yield the same word forever.
            raise ValueError('record not NUL-terminated: word=%r inf1=%r '
                             'inf2=%r next=%r' % (word, inf1, inf2, buf[end]))
        buf = buf[(end + 1):]


def normalwords(reader):
    """Yield only the plain lowercase entries from `reader`.

    `reader` is an iterable of (word, inf1, inf2) tuples. An entry is dropped
    when inf2 - 4 differs from the word length, or when inf1 is one of the
    values listed below. Every surviving entry is asserted to contain no
    hyphen and no ASCII uppercase letter before it is yielded unchanged.
    """
    all_caps = (7, 23)                   # all-caps abbreviation ("OZB")
    punctuated = (2, 6, 10, 18, 22)      # hyphens and diaereses
    names = (0, 1, 4, 5, 17, 20, 21)     # place name, proper name ("Bibi")
    rejected = all_caps + punctuated + names
    uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    for entry in reader:
        word, inf1, inf2 = entry

        # Something with a deviating stem...
        if inf2 - 4 != len(word) or inf1 in rejected:
            continue

        assert '-' not in word
        assert not any(char in uppercase for char in word)
        yield word, inf1, inf2


def limitedsizewords(reader, min, max):
    """Yield the entries of `reader` whose word length is within bounds.

    `reader` is an iterable of (word, inf1, inf2) tuples; an entry is passed
    through unchanged when min <= len(word) <= max (both ends inclusive).
    """
    for entry in reader:
        word, inf1, inf2 = entry
        if min <= len(word) <= max:
            yield word, inf1, inf2


# Chain the generators: raw records -> plain lowercase words -> words of 4 to
# 7 characters, and print every word that survives. The `with` block closes
# the dictionary file once the pipeline is exhausted or an error escapes.
with open(source, 'rb') as source_file:
    dictionary = aspellreader(source_file, first_data)
    dictionary = normalwords(dictionary)
    dictionary = limitedsizewords(dictionary, 4, 7)
    # The for loop already consumes the generators' StopIteration, so no
    # explicit handler is needed.
    for word, inf1, inf2 in dictionary:
        # Parenthesised form prints the same under Python 2.
        print(word)